1import re
2import datetime
3import decimal
4from .generic import PdfObject
5from xml.dom import getDOMImplementation
6from xml.dom.minidom import parseString
7from .utils import u_
8
# XML namespace URIs used to locate properties inside an XMP packet.  Each
# one is matched against element/attribute namespaces when querying the DOM.

# rdf: -- the RDF wrapper elements (RDF, Description, Bag, Seq, Alt, li)
RDF_NAMESPACE = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"

# dc: -- properties exposed as dc_* attributes
DC_NAMESPACE = "http://purl.org/dc/elements/1.1/"

# xmp: -- properties exposed as xmp_* attributes
XMP_NAMESPACE = "http://ns.adobe.com/xap/1.0/"

# pdf: -- properties exposed as pdf_* attributes
PDF_NAMESPACE = "http://ns.adobe.com/pdf/1.3/"

# xmpMM: -- properties exposed as xmpmm_* attributes
XMPMM_NAMESPACE = "http://ns.adobe.com/xap/1.0/mm/"
14
15# What is the PDFX namespace, you might ask?  I might ask that too.  It's
16# a completely undocumented namespace used to place "custom metadata"
17# properties, which are arbitrary metadata properties with no semantic or
18# documented meaning.  Elements in the namespace are key/value-style storage,
19# where the element name is the key and the content is the value.  The keys
20# are transformed into valid XML identifiers by substituting an invalid
21# identifier character with \u2182 followed by the unicode hex ID of the
22# original character.  A key like "my car" is therefore "my\u21820020car".
23#
24# \u2182, in case you're wondering, is the unicode character
25# \u{ROMAN NUMERAL TEN THOUSAND}, a straightforward and obvious choice for
26# escaping characters.
27#
# Deliberate use of the pdfx namespace is strongly discouraged.  A custom
# data schema with sensible XML elements should be used instead, as is
# suggested by Adobe's own documentation on XMP (under "Extensibility of
# Schemas").
32#
33# Information presented here on the /pdfx/ schema is a result of limited
34# reverse engineering, and does not constitute a full specification.
PDFX_NAMESPACE = "http://ns.adobe.com/pdfx/1.3/"

# Pattern for the ISO 8601 subset used by XMP date values.  Only the year is
# mandatory; month, day and the time part are progressively optional.  When
# a time part is present it must carry a time zone designator ("Z" or a
# "+hh:mm" / "-hh:mm" offset), otherwise the time part is not matched at all.
#
# The pattern is written as a raw string and the fractional-seconds
# separator is an escaped literal dot, so that input such as "05x25" is not
# mistaken for a seconds value.
iso8601 = re.compile(r"""
        (?P<year>[0-9]{4})
        (-
            (?P<month>[0-9]{2})
            (-
                (?P<day>[0-9]+)
                (T
                    (?P<hour>[0-9]{2}):
                    (?P<minute>[0-9]{2})
                    (:(?P<second>[0-9]{2}(\.[0-9]+)?))?
                    (?P<tzd>Z|[-+][0-9]{2}:[0-9]{2})
                )?
            )?
        )?
        """, re.VERBOSE)
52
53
class XmpInformation(PdfObject):
    """
    An object that represents Adobe XMP metadata.
    Usually accessed by :meth:`getXmpMetadata()<PyPDF2.PdfFileReader.getXmpMetadata>`

    The XML packet is parsed once in the constructor.  Individual properties
    are converted on first access and memoised in :attr:`cache`.
    """

    # Matches one escaped character in a pdfx key: U+2182 followed by exactly
    # four hexadecimal digits (see the PDFX_NAMESPACE notes earlier in file).
    _pdfx_escape = re.compile(u_("\u2182") + "([0-9A-Fa-f]{4})")

    def __init__(self, stream):
        """
        Parse the XMP packet held by ``stream``.

        :param stream: object whose ``getData()`` returns the XML packet and
            whose ``writeToStream()`` is used by :meth:`writeToStream`.
        :raises IndexError: if the packet contains no ``rdf:RDF`` element.
        """
        self.stream = stream
        docRoot = parseString(self.stream.getData())
        self.rdfRoot = docRoot.getElementsByTagNameNS(RDF_NAMESPACE, "RDF")[0]
        # namespace URI -> {property name -> converted value}
        self.cache = {}

    def writeToStream(self, stream, encryption_key):
        """Write the underlying stream object unchanged to ``stream``."""
        self.stream.writeToStream(stream, encryption_key)

    def getElement(self, aboutUri, namespace, name):
        """
        Yield every node holding the property ``namespace``/``name``.

        Only ``rdf:Description`` elements whose ``rdf:about`` equals
        ``aboutUri`` are searched.  For each of them the matching attribute
        node (if any) is yielded first, followed by matching descendant
        elements.
        """
        for desc in self.rdfRoot.getElementsByTagNameNS(RDF_NAMESPACE, "Description"):
            if desc.getAttributeNS(RDF_NAMESPACE, "about") != aboutUri:
                continue
            attr = desc.getAttributeNodeNS(namespace, name)
            if attr is not None:
                yield attr
            for element in desc.getElementsByTagNameNS(namespace, name):
                yield element

    def getNodesInNamespace(self, aboutUri, namespace):
        """
        Yield every attribute and direct child of the matching
        ``rdf:Description`` elements that belongs to ``namespace``.

        Only descriptions whose ``rdf:about`` equals ``aboutUri`` are used.
        """
        for desc in self.rdfRoot.getElementsByTagNameNS(RDF_NAMESPACE, "Description"):
            if desc.getAttributeNS(RDF_NAMESPACE, "about") != aboutUri:
                continue
            attributes = desc.attributes
            for i in range(attributes.length):
                attr = attributes.item(i)
                if attr.namespaceURI == namespace:
                    yield attr
            for child in desc.childNodes:
                if child.namespaceURI == namespace:
                    yield child

    def _getText(self, element):
        """Return the concatenated data of the direct text children of ``element``."""
        return "".join(
            child.data
            for child in element.childNodes
            if child.nodeType == child.TEXT_NODE
        )

    def _getNodeText(self, node):
        """Return the value of an attribute node, or the text of an element."""
        if node.nodeType == node.ATTRIBUTE_NODE:
            return node.nodeValue
        return self._getText(node)

    def _getContainers(self, node, tag):
        """
        Return the ``rdf:<tag>`` container elements found below ``node``.

        Attribute nodes cannot contain elements (and do not offer
        ``getElementsByTagNameNS``), so they yield an empty list.
        """
        if node.nodeType == node.ATTRIBUTE_NODE:
            return []
        return node.getElementsByTagNameNS(RDF_NAMESPACE, tag)

    # NOTE: the converters and getter factories below are deliberately plain
    # functions without ``self``: they are called while the class body is
    # being executed in order to build the properties further down.

    def _converter_string(value):
        """Identity converter for plain text values."""
        return value

    def _converter_date(value):
        """
        Convert an ISO 8601 date string to a naive ``datetime`` in UTC.

        Missing month/day default to 1 and missing time fields to 0.  A
        time zone offset, when present, is applied so that the result is
        expressed in UTC; fractional seconds are truncated to microseconds.

        :raises ValueError: if ``value`` does not start with a four-digit
            year, or describes an impossible date.
        """
        m = iso8601.match(value)
        if m is None:
            raise ValueError("Invalid XMP date: %r" % (value,))
        year = int(m.group("year"))
        month = int(m.group("month") or "1")
        day = int(m.group("day") or "1")
        hour = int(m.group("hour") or "0")
        minute = int(m.group("minute") or "0")
        second = decimal.Decimal(m.group("second") or "0")
        whole_seconds = second.to_integral(decimal.ROUND_FLOOR)
        # datetime only accepts real integers, not Decimal instances.
        microseconds = int((second - whole_seconds) * 1000000)
        dt = datetime.datetime(
            year, month, day, hour, minute, int(whole_seconds), microseconds
        )
        tzd = m.group("tzd") or "Z"
        if tzd != "Z":
            # Take the sign from the designator itself: deriving it from the
            # parsed hour loses it for offsets such as "+00:30".
            sign = -1 if tzd[0] == "-" else 1
            tzd_hours, tzd_minutes = [int(x) for x in tzd[1:].split(":")]
            offset = datetime.timedelta(hours=tzd_hours, minutes=tzd_minutes)
            # local = UTC + offset  =>  UTC = local - offset
            dt = dt - sign * offset
        return dt
    _test_converter_date = staticmethod(_converter_date)

    def _getter_bag(namespace, name, converter):
        """
        Build a getter returning the converted ``rdf:li`` items of every
        ``rdf:Bag`` found in the property, as a list.
        """
        def get(self):
            ns_cache = self.cache.setdefault(namespace, {})
            # Membership test so that empty results are cached as well.
            if name in ns_cache:
                return ns_cache[name]
            retval = []
            for element in self.getElement("", namespace, name):
                for bag in self._getContainers(element, "Bag"):
                    for item in bag.getElementsByTagNameNS(RDF_NAMESPACE, "li"):
                        retval.append(converter(self._getText(item)))
            ns_cache[name] = retval
            return retval
        return get

    def _getter_seq(namespace, name, converter):
        """
        Build a getter returning the converted ``rdf:li`` items of every
        ``rdf:Seq`` found in the property, as a list.  A property without
        any ``rdf:Seq`` contributes its own text as a single item.
        """
        def get(self):
            ns_cache = self.cache.setdefault(namespace, {})
            # Membership test so that empty results are cached as well.
            if name in ns_cache:
                return ns_cache[name]
            retval = []
            for element in self.getElement("", namespace, name):
                seqs = self._getContainers(element, "Seq")
                if seqs:
                    for seq in seqs:
                        for item in seq.getElementsByTagNameNS(RDF_NAMESPACE, "li"):
                            retval.append(converter(self._getText(item)))
                else:
                    retval.append(converter(self._getNodeText(element)))
            ns_cache[name] = retval
            return retval
        return get

    def _getter_langalt(namespace, name, converter):
        """
        Build a getter returning a dictionary that maps the ``xml:lang`` of
        each ``rdf:li`` inside an ``rdf:Alt`` to its converted text.  A
        property without any ``rdf:Alt`` is stored under ``"x-default"``.
        """
        def get(self):
            ns_cache = self.cache.setdefault(namespace, {})
            # Membership test so that empty results are cached as well.
            if name in ns_cache:
                return ns_cache[name]
            retval = {}
            for element in self.getElement("", namespace, name):
                alts = self._getContainers(element, "Alt")
                if alts:
                    for alt in alts:
                        for item in alt.getElementsByTagNameNS(RDF_NAMESPACE, "li"):
                            value = converter(self._getText(item))
                            retval[item.getAttribute("xml:lang")] = value
                else:
                    retval["x-default"] = converter(self._getNodeText(element))
            ns_cache[name] = retval
            return retval
        return get

    def _getter_single(namespace, name, converter):
        """
        Build a getter returning the converted value of the first node
        holding the property, or ``None`` when the property is absent.
        """
        def get(self):
            ns_cache = self.cache.setdefault(namespace, {})
            # Membership test so that None / empty values are cached as well.
            if name in ns_cache:
                return ns_cache[name]
            value = None
            # Only the first matching node is considered.
            for element in self.getElement("", namespace, name):
                value = self._getNodeText(element)
                break
            if value is not None:
                value = converter(value)
            ns_cache[name] = value
            return value
        return get

    dc_contributor = property(_getter_bag(DC_NAMESPACE, "contributor", _converter_string))
    """
    Contributors to the resource (other than the authors). An unsorted
    array of names.
    """

    dc_coverage = property(_getter_single(DC_NAMESPACE, "coverage", _converter_string))
    """
    Text describing the extent or scope of the resource.
    """

    dc_creator = property(_getter_seq(DC_NAMESPACE, "creator", _converter_string))
    """
    A sorted array of names of the authors of the resource, listed in order
    of precedence.
    """

    dc_date = property(_getter_seq(DC_NAMESPACE, "date", _converter_date))
    """
    A sorted array of dates (datetime.datetime instances) of significance to
    the resource.  The dates and times are in UTC.
    """

    dc_description = property(_getter_langalt(DC_NAMESPACE, "description", _converter_string))
    """
    A language-keyed dictionary of textual descriptions of the content of the
    resource.
    """

    dc_format = property(_getter_single(DC_NAMESPACE, "format", _converter_string))
    """
    The mime-type of the resource.
    """

    dc_identifier = property(_getter_single(DC_NAMESPACE, "identifier", _converter_string))
    """
    Unique identifier of the resource.
    """

    dc_language = property(_getter_bag(DC_NAMESPACE, "language", _converter_string))
    """
    An unordered array specifying the languages used in the resource.
    """

    dc_publisher = property(_getter_bag(DC_NAMESPACE, "publisher", _converter_string))
    """
    An unordered array of publisher names.
    """

    dc_relation = property(_getter_bag(DC_NAMESPACE, "relation", _converter_string))
    """
    An unordered array of text descriptions of relationships to other
    documents.
    """

    dc_rights = property(_getter_langalt(DC_NAMESPACE, "rights", _converter_string))
    """
    A language-keyed dictionary of textual descriptions of the rights the
    user has to this resource.
    """

    dc_source = property(_getter_single(DC_NAMESPACE, "source", _converter_string))
    """
    Unique identifier of the work from which this resource was derived.
    """

    dc_subject = property(_getter_bag(DC_NAMESPACE, "subject", _converter_string))
    """
    An unordered array of descriptive phrases or keywords that specify the
    topic of the content of the resource.
    """

    dc_title = property(_getter_langalt(DC_NAMESPACE, "title", _converter_string))
    """
    A language-keyed dictionary of the title of the resource.
    """

    dc_type = property(_getter_bag(DC_NAMESPACE, "type", _converter_string))
    """
    An unordered array of textual descriptions of the document type.
    """

    pdf_keywords = property(_getter_single(PDF_NAMESPACE, "Keywords", _converter_string))
    """
    An unformatted text string representing document keywords.
    """

    pdf_pdfversion = property(_getter_single(PDF_NAMESPACE, "PDFVersion", _converter_string))
    """
    The PDF file version, for example 1.0, 1.3.
    """

    pdf_producer = property(_getter_single(PDF_NAMESPACE, "Producer", _converter_string))
    """
    The name of the tool that created the PDF document.
    """

    xmp_createDate = property(_getter_single(XMP_NAMESPACE, "CreateDate", _converter_date))
    """
    The date and time the resource was originally created.  The date and
    time are returned as a UTC datetime.datetime object.
    """

    xmp_modifyDate = property(_getter_single(XMP_NAMESPACE, "ModifyDate", _converter_date))
    """
    The date and time the resource was last modified.  The date and time
    are returned as a UTC datetime.datetime object.
    """

    xmp_metadataDate = property(_getter_single(XMP_NAMESPACE, "MetadataDate", _converter_date))
    """
    The date and time that any metadata for this resource was last
    changed.  The date and time are returned as a UTC datetime.datetime
    object.
    """

    xmp_creatorTool = property(_getter_single(XMP_NAMESPACE, "CreatorTool", _converter_string))
    """
    The name of the first known tool used to create the resource.
    """

    xmpmm_documentId = property(_getter_single(XMPMM_NAMESPACE, "DocumentID", _converter_string))
    """
    The common identifier for all versions and renditions of this resource.
    """

    xmpmm_instanceId = property(_getter_single(XMPMM_NAMESPACE, "InstanceID", _converter_string))
    """
    An identifier for a specific incarnation of a document, updated each
    time a file is saved.
    """

    @property
    def custom_properties(self):
        """
        Retrieves custom metadata properties defined in the undocumented pdfx
        metadata schema.

        Escaped characters in the keys (U+2182 followed by four hexadecimal
        digits) are decoded; sequences that do not have that exact form are
        left untouched.  The dictionary is built on first access and reused
        afterwards.

        :return: a dictionary of key/value items for custom metadata properties.
        :rtype: dict
        """
        if not hasattr(self, "_custom_properties"):
            def decode(match):
                return chr(int(match.group(1), 16))

            properties = {}
            for node in self.getNodesInNamespace("", PDFX_NAMESPACE):
                # A single left-to-right substitution never rescans decoded
                # text, so an escaped U+2182 cannot start a bogus escape.
                # See documentation about PDFX_NAMESPACE earlier in file.
                key = self._pdfx_escape.sub(decode, node.localName)
                properties[key] = self._getNodeText(node)
            self._custom_properties = properties
        return self._custom_properties
359