import logging
import warnings
from . import settings
from .psparser import LIT
from .pdftypes import PDFObjectNotFound
from .pdftypes import resolve1
from .pdftypes import int_value
from .pdftypes import list_value
from .pdftypes import dict_value
from .pdfparser import PDFParser
from .pdfdocument import PDFDocument, PDFTextExtractionNotAllowed
from .pdfdocument import PDFTextExtractionNotAllowedWarning


log = logging.getLogger(__name__)

# some predefined literals and keywords.
LITERAL_PAGE = LIT('Page')
LITERAL_PAGES = LIT('Pages')


class PDFPage:
    """An object that holds the information about a page.

    A PDFPage object is merely a convenience class that has a set
    of keys and values, which describe the properties of a page
    and point to its contents.

    Attributes:
        doc: a PDFDocument object.
        pageid: any Python object that can uniquely identify the page.
        attrs: a dictionary of page attributes.
        contents: a list of PDFStream objects that represents the page content.
        lastmod: the last modified time of the page.
        resources: the resources used by the page (an empty dict when the
            page declares none).
        mediabox: the physical size of the page.
        cropbox: the crop rectangle of the page (falls back to mediabox).
        rotate: the page rotation (in degree), normalized with modulo 360.
        annots: the page annotations.
        beads: a chain that represents natural reading order.
    """

    def __init__(self, doc, pageid, attrs):
        """Initialize a page object.

        doc: a PDFDocument object.
        pageid: any Python object that can uniquely identify the page.
        attrs: a dictionary of page attributes.

        Raises KeyError when attrs has no 'MediaBox' entry.
        """
        self.doc = doc
        self.pageid = pageid
        self.attrs = dict_value(attrs)
        self.lastmod = resolve1(self.attrs.get('LastModified'))
        self.resources = resolve1(self.attrs.get('Resources', dict()))
        self.mediabox = resolve1(self.attrs['MediaBox'])
        if 'CropBox' in self.attrs:
            self.cropbox = resolve1(self.attrs['CropBox'])
        else:
            # No explicit crop rectangle: use the media box instead.
            self.cropbox = self.mediabox
        # Adding 360 before the modulo keeps the result in [0, 360).
        self.rotate = (int_value(self.attrs.get('Rotate', 0))+360) % 360
        self.annots = self.attrs.get('Annots')
        self.beads = self.attrs.get('B')
        if 'Contents' in self.attrs:
            contents = resolve1(self.attrs['Contents'])
        else:
            contents = []
        # A single content stream is wrapped so that self.contents is
        # always a list.
        if not isinstance(contents, list):
            contents = [contents]
        self.contents = contents

    def __repr__(self):
        return '<PDFPage: Resources={!r}, MediaBox={!r}>'\
            .format(self.resources, self.mediabox)

    # Attributes a page node takes over from its parent node when it does
    # not define them itself.
    INHERITABLE_ATTRS = {'Resources', 'MediaBox', 'CropBox', 'Rotate'}

    @classmethod
    def create_pages(cls, document):
        """Yield a PDFPage for every page found in the document.

        The page tree referenced by the catalog's 'Pages' entry is walked
        depth-first. If that yields no page at all (or the entry is
        missing), every object listed in the document's xrefs is scanned
        for dictionaries whose 'Type' is the Page literal.

        document: a PDFDocument object.
        """
        def search(obj, parent, ancestors=frozenset()):
            # `ancestors` holds the object ids on the path from the root
            # to the current node. A kid that points back onto that path
            # would make the traversal recurse forever, so it is skipped.
            if isinstance(obj, int):
                objid = obj
            else:
                objid = obj.objid
            if objid in ancestors:
                log.warning('Ignoring cyclic page tree reference: %r', objid)
                return
            if isinstance(obj, int):
                tree = dict_value(document.getobj(objid)).copy()
            else:
                tree = dict_value(obj).copy()
            # Inheritable attributes propagate down from the parent node
            # unless the node overrides them.
            for (k, v) in parent.items():
                if k in cls.INHERITABLE_ATTRS and k not in tree:
                    tree[k] = v

            tree_type = tree.get('Type')
            if tree_type is None and not settings.STRICT:  # See #64
                tree_type = tree.get('type')

            if tree_type is LITERAL_PAGES and 'Kids' in tree:
                log.info('Pages: Kids=%r', tree['Kids'])
                path = ancestors | {objid}
                for c in list_value(tree['Kids']):
                    yield from search(c, tree, path)
            elif tree_type is LITERAL_PAGE:
                log.info('Page: %r', tree)
                yield (objid, tree)

        pages = False
        if 'Pages' in document.catalog:
            objects = search(document.catalog['Pages'], document.catalog)
            for (objid, tree) in objects:
                yield cls(document, objid, tree)
                pages = True
        if not pages:
            # fallback when /Pages is missing.
            for xref in document.xrefs:
                for objid in xref.get_objids():
                    try:
                        obj = document.getobj(objid)
                        if isinstance(obj, dict) \
                                and obj.get('Type') is LITERAL_PAGE:
                            yield cls(document, objid, obj)
                    except PDFObjectNotFound:
                        # Best effort: skip objects that cannot be found.
                        pass

    @classmethod
    def get_pages(cls, fp,
                  pagenos=None, maxpages=0, password='',
                  caching=True, check_extractable=False):
        """Parse the PDF read from fp and yield its pages as PDFPage objects.

        fp: the file object handed to PDFParser.
        pagenos: a container of zero-based page indices to yield; when
            empty or None every page is yielded.
        maxpages: when non-zero, iteration stops right after yielding a
            page whose one-based position in the document is greater than
            or equal to this value. Pages skipped because of pagenos do
            not trigger the check.
        password: the password passed to PDFDocument.
        caching: the caching flag passed to PDFDocument.
        check_extractable: when true, a document that is not extractable
            raises PDFTextExtractionNotAllowed; otherwise a
            PDFTextExtractionNotAllowedWarning is issued and processing
            continues.
        """
        # Create a PDF parser object associated with the file object.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        doc = PDFDocument(parser, password=password, caching=caching)
        # Check if the document allows text extraction.
        # If not, warn the user and proceed.
        if not doc.is_extractable:
            if check_extractable:
                error_msg = 'Text extraction is not allowed: %r' % fp
                raise PDFTextExtractionNotAllowed(error_msg)
            else:
                warning_msg = 'The PDF %r contains a metadata field '\
                            'indicating that it should not allow ' \
                            'text extraction. Ignoring this field ' \
                            'and proceeding.' % fp
                warnings.warn(warning_msg, PDFTextExtractionNotAllowedWarning)
        # Process each page contained in the document.
        for (pageno, page) in enumerate(cls.create_pages(doc)):
            if pagenos and (pageno not in pagenos):
                continue
            yield page
            if maxpages and maxpages <= pageno+1:
                break