1import logging
2import warnings
3from . import settings
4from .psparser import LIT
5from .pdftypes import PDFObjectNotFound
6from .pdftypes import resolve1
7from .pdftypes import int_value
8from .pdftypes import list_value
9from .pdftypes import dict_value
10from .pdfparser import PDFParser
11from .pdfdocument import PDFDocument, PDFTextExtractionNotAllowed
12from .pdfdocument import PDFTextExtractionNotAllowedWarning
13
14
# Module-level logger, named after this module so that applications can
# configure its output independently.
log = logging.getLogger(__name__)
16
# Predefined literals for the /Type values of page-tree nodes.
# They are compared with ``is`` in PDFPage.create_pages, which presumably
# relies on LIT() returning one shared object per name -- confirm against
# psparser.
LITERAL_PAGE = LIT('Page')
LITERAL_PAGES = LIT('Pages')
20
21
class PDFPage:
    """An object that holds the information about a page.

    A PDFPage object is merely a convenience class that has a set
    of keys and values, which describe the properties of a page
    and point to its contents.

    Attributes:
      doc: a PDFDocument object.
      pageid: any Python object that can uniquely identify the page.
      attrs: a dictionary of page attributes.
      contents: a list of PDFStream objects that represents the page content.
      lastmod: the last modified time of the page.
      resources: the resolved Resources entry of the page (an empty
        dictionary is used as the default when the entry is absent).
      mediabox: the physical size of the page.
      cropbox: the crop rectangle of the page (falls back to mediabox).
      rotate: the page rotation (in degree), normalized into [0, 360).
      annots: the page annotations.
      beads: a chain that represents natural reading order.
    """

    def __init__(self, doc, pageid, attrs):
        """Initialize a page object.

        doc: a PDFDocument object.
        pageid: any Python object that can uniquely identify the page.
        attrs: a dictionary of page attributes.

        The 'MediaBox' entry is read with a plain subscript, so a page
        without one (neither its own nor an inherited one) makes the
        constructor fail on that lookup.
        """
        self.doc = doc
        self.pageid = pageid
        self.attrs = dict_value(attrs)
        self.lastmod = resolve1(self.attrs.get('LastModified'))
        self.resources = resolve1(self.attrs.get('Resources', dict()))
        self.mediabox = resolve1(self.attrs['MediaBox'])
        # Without an explicit crop rectangle the whole media box is used.
        if 'CropBox' in self.attrs:
            self.cropbox = resolve1(self.attrs['CropBox'])
        else:
            self.cropbox = self.mediabox
        self.rotate = (int_value(self.attrs.get('Rotate', 0))+360) % 360
        self.annots = self.attrs.get('Annots')
        self.beads = self.attrs.get('B')
        if 'Contents' in self.attrs:
            contents = resolve1(self.attrs['Contents'])
        else:
            contents = []
        # 'Contents' may hold a single object or a list; always expose
        # a list.
        if not isinstance(contents, list):
            contents = [contents]
        self.contents = contents

    def __repr__(self):
        return '<PDFPage: Resources={!r}, MediaBox={!r}>'\
            .format(self.resources, self.mediabox)

    # Attributes a page-tree node passes down to kids that do not define
    # them themselves.
    INHERITABLE_ATTRS = {'Resources', 'MediaBox', 'CropBox', 'Rotate'}

    @classmethod
    def create_pages(cls, document):
        """Yield a PDFPage for every page found in *document*.

        The page tree rooted at ``document.catalog['Pages']`` is walked
        depth-first.  Attributes listed in INHERITABLE_ATTRS are copied
        from a parent node into each kid that lacks them.  A kid that is
        already on the path from the root to the current node (a circular
        reference) is logged and skipped instead of being descended into
        forever.

        If the tree yields no page at all (or the catalog has no 'Pages'
        entry), every object id of every xref is scanned and each
        dictionary whose 'Type' is the Page literal becomes a page.
        """
        def search(obj, parent, ancestors=frozenset()):
            # A node is given either as a bare object id or as an object
            # that carries its id in ``objid``.
            by_id = isinstance(obj, int)
            objid = obj if by_id else obj.objid
            if objid in ancestors:
                # This node is its own ancestor: following it again would
                # recurse without end.
                log.warning('Circular reference in page tree, '
                            'skipping objid=%r', objid)
                return
            # Work on a copy so that inherited attributes are not written
            # back into the document's own objects.
            tree = dict_value(document.getobj(objid) if by_id else obj).copy()
            for (k, v) in parent.items():
                if k in cls.INHERITABLE_ATTRS and k not in tree:
                    tree[k] = v

            tree_type = tree.get('Type')
            if tree_type is None and not settings.STRICT:  # See #64
                # Outside strict mode a lowercase 'type' key is accepted.
                tree_type = tree.get('type')

            if tree_type is LITERAL_PAGES and 'Kids' in tree:
                log.info('Pages: Kids=%r', tree['Kids'])
                lineage = ancestors | {objid}
                for c in list_value(tree['Kids']):
                    yield from search(c, tree, lineage)
            elif tree_type is LITERAL_PAGE:
                log.info('Page: %r', tree)
                yield (objid, tree)

        pages = False
        if 'Pages' in document.catalog:
            objects = search(document.catalog['Pages'], document.catalog)
            for (objid, tree) in objects:
                yield cls(document, objid, tree)
                pages = True
        if not pages:
            # fallback when /Pages is missing.
            for xref in document.xrefs:
                for objid in xref.get_objids():
                    try:
                        obj = document.getobj(objid)
                        if isinstance(obj, dict) \
                                and obj.get('Type') is LITERAL_PAGE:
                            yield cls(document, objid, obj)
                    except PDFObjectNotFound:
                        # Listed in the xref but not retrievable: skip.
                        pass

    @classmethod
    def get_pages(cls, fp,
                  pagenos=None, maxpages=0, password='',
                  caching=True, check_extractable=False):
        """Parse the PDF readable from *fp* and yield its pages.

        fp: the file object handed to PDFParser.
        pagenos: container of zero-based page indexes to yield; when it
          is empty or None every page is a candidate.
        maxpages: when non-zero, iteration stops right after yielding a
          page whose one-based position in the document is at least
          maxpages.  Pages skipped because of pagenos never trigger the
          stop.
        password: forwarded to PDFDocument.
        caching: forwarded to PDFDocument.
        check_extractable: when true, a document that is not marked as
          extractable raises PDFTextExtractionNotAllowed; otherwise a
          PDFTextExtractionNotAllowedWarning is issued and processing
          continues.
        """
        # Create a PDF parser object associated with the file object.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        doc = PDFDocument(parser, password=password, caching=caching)
        # Check if the document allows text extraction.
        # If not, warn the user and proceed.
        if not doc.is_extractable:
            if check_extractable:
                error_msg = 'Text extraction is not allowed: %r' % fp
                raise PDFTextExtractionNotAllowed(error_msg)
            else:
                warning_msg = ('The PDF %r contains a metadata field '
                               'indicating that it should not allow '
                               'text extraction. Ignoring this field '
                               'and proceeding.' % fp)
                warnings.warn(warning_msg, PDFTextExtractionNotAllowedWarning)
        # Process each page contained in the document.
        for (pageno, page) in enumerate(cls.create_pages(doc)):
            if pagenos and (pageno not in pagenos):
                continue
            yield page
            if maxpages and maxpages <= pageno+1:
                break
149