1import re
2import datetime
3import decimal
4from generic import PdfObject
5from xml.dom import getDOMImplementation
6from xml.dom.minidom import parseString
7
8RDF_NAMESPACE = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
9DC_NAMESPACE = "http://purl.org/dc/elements/1.1/"
10XMP_NAMESPACE = "http://ns.adobe.com/xap/1.0/"
11PDF_NAMESPACE = "http://ns.adobe.com/pdf/1.3/"
12XMPMM_NAMESPACE = "http://ns.adobe.com/xap/1.0/mm/"
13
14# What is the PDFX namespace, you might ask?  I might ask that too.  It's
15# a completely undocumented namespace used to place "custom metadata"
16# properties, which are arbitrary metadata properties with no semantic or
17# documented meaning.  Elements in the namespace are key/value-style storage,
18# where the element name is the key and the content is the value.  The keys
19# are transformed into valid XML identifiers by substituting an invalid
20# identifier character with \u2182 followed by the unicode hex ID of the
21# original character.  A key like "my car" is therefore "my\u21820020car".
22#
23# \u2182, in case you're wondering, is the unicode character
24# \u{ROMAN NUMERAL TEN THOUSAND}, a straightforward and obvious choice for
25# escaping characters.
26#
# Deliberate use of the pdfx namespace is strongly discouraged.  A custom
# data schema with meaningful XML elements should be used instead, as is
# suggested by Adobe's own documentation on XMP (under "Extensibility of
# Schemas").
31#
32# Information presented here on the /pdfx/ schema is a result of limited
33# reverse engineering, and does not constitute a full specification.
34PDFX_NAMESPACE = "http://ns.adobe.com/pdfx/1.3/"
35
# Pattern for ISO 8601-style timestamps of the form
# YYYY[-MM[-DD[Thh:mm[:ss[.fraction]]TZD]]], where TZD is "Z" or a
# "+hh:mm" / "-hh:mm" offset.  Only the year is mandatory; each finer-grained
# part is optional but requires all coarser parts before it.  When a time is
# present the time zone designator is mandatory.
#
# The pattern is written as a raw string and the fractional-seconds
# separator is an escaped literal dot, so input such as "05x25" is not
# accepted as a seconds value.
iso8601 = re.compile(r"""
        (?P<year>[0-9]{4})
        (-
            (?P<month>[0-9]{2})
            (-
                (?P<day>[0-9]+)
                (T
                    (?P<hour>[0-9]{2}):
                    (?P<minute>[0-9]{2})
                    (:(?P<second>[0-9]{2}(\.[0-9]+)?))?
                    (?P<tzd>Z|[-+][0-9]{2}:[0-9]{2})
                )?
            )?
        )?
        """, re.VERBOSE)
51
52##
53# An object that represents Adobe XMP metadata.
class XmpInformation(PdfObject):
    """Read-only accessor for an Adobe XMP metadata packet.

    The packet is parsed once, at construction time, from the data of the
    wrapped stream object.  Individual metadata values are exposed as
    properties (``dc_*``, ``pdf_*``, ``xmp_*``, ``xmpmm_*`` and
    ``custom_properties``).  Each value is computed on first access and
    then cached on the instance.
    """

    def __init__(self, stream):
        """Parse the XMP packet held by *stream*.

        stream -- object providing ``getData()`` (the XML text of the
            packet) and ``writeToStream()``.

        Raises IndexError if the document contains no rdf:RDF element.
        Errors raised by ``parseString`` propagate unchanged.
        """
        self.stream = stream
        docRoot = parseString(self.stream.getData())
        self.rdfRoot = docRoot.getElementsByTagNameNS(RDF_NAMESPACE, "RDF")[0]
        # Per-instance cache of converted property values:
        # {namespace URI: {property name: value}}.
        self.cache = {}

    def writeToStream(self, stream, encryption_key):
        """Write the wrapped stream object to *stream* (pure delegation)."""
        self.stream.writeToStream(stream, encryption_key)

    def getElement(self, aboutUri, namespace, name):
        """Yield every node that carries the property *namespace*:*name*.

        Only rdf:Description elements whose rdf:about attribute equals
        *aboutUri* are searched.  For each of them the matching attribute
        node (if any) is yielded first, followed by all matching descendant
        elements.
        """
        for desc in self.rdfRoot.getElementsByTagNameNS(RDF_NAMESPACE, "Description"):
            if desc.getAttributeNS(RDF_NAMESPACE, "about") == aboutUri:
                attr = desc.getAttributeNodeNS(namespace, name)
                if attr is not None:
                    yield attr
                for element in desc.getElementsByTagNameNS(namespace, name):
                    yield element

    def getNodesInNamespace(self, aboutUri, namespace):
        """Yield all attributes and direct children in *namespace*.

        Only rdf:Description elements whose rdf:about attribute equals
        *aboutUri* are searched.  Attribute nodes are yielded before child
        nodes for each description.
        """
        for desc in self.rdfRoot.getElementsByTagNameNS(RDF_NAMESPACE, "Description"):
            if desc.getAttributeNS(RDF_NAMESPACE, "about") == aboutUri:
                for i in range(desc.attributes.length):
                    attr = desc.attributes.item(i)
                    if attr.namespaceURI == namespace:
                        yield attr
                for child in desc.childNodes:
                    if child.namespaceURI == namespace:
                        yield child

    def _getText(self, element):
        """Return the concatenated data of *element*'s direct text children.

        Text nested inside child elements is not included.
        """
        return "".join(
            child.data
            for child in element.childNodes
            if child.nodeType == child.TEXT_NODE
        )

    # NOTE: the _converter_* and _getter_* functions below take no ``self``.
    # They are plain helper functions that are called while the class body
    # executes in order to build the property objects further down.

    def _converter_string(value):
        """Identity converter: return the text value unchanged."""
        return value

    def _converter_date(value):
        """Convert an ISO 8601-style timestamp string to a naive UTC datetime.

        Missing month/day default to 1 and missing time fields default to 0.
        A missing time zone designator is treated as "Z" (UTC).  Fractional
        seconds are truncated to whole microseconds.

        Raises ValueError if *value* does not start with a recognisable
        date, or if a field is out of range for ``datetime.datetime``.
        """
        m = iso8601.match(value)
        if m is None:
            raise ValueError("Invalid ISO 8601 date: %r" % (value,))
        year = int(m.group("year"))
        month = int(m.group("month") or "1")
        day = int(m.group("day") or "1")
        hour = int(m.group("hour") or "0")
        minute = int(m.group("minute") or "0")
        second = decimal.Decimal(m.group("second") or "0")
        # datetime.datetime() only accepts real integers, so split the
        # (non-negative) Decimal into whole seconds and microseconds.
        whole_seconds = int(second)
        microseconds = int((second - whole_seconds) * 1000000)
        tzd = m.group("tzd") or "Z"
        dt = datetime.datetime(year, month, day, hour, minute,
                               whole_seconds, microseconds)
        if tzd != "Z":
            # Read the sign from the designator itself: int("+00") and
            # int("-00") are both 0, so the sign cannot be recovered from
            # the parsed hour value.
            sign = -1 if tzd[0] == "-" else 1
            tzd_hours, tzd_minutes = [int(x) for x in tzd[1:].split(":")]
            offset = datetime.timedelta(hours=tzd_hours, minutes=tzd_minutes)
            # UTC = local time - offset.
            dt = dt - sign * offset
        return dt
    _test_converter_date = staticmethod(_converter_date)

    def _getter_bag(namespace, name, converter):
        """Build a getter returning a list of the rdf:Bag items of a property.

        Each rdf:li text inside any rdf:Bag of the property is passed through
        *converter*.  Properties without an rdf:Bag contribute nothing.  The
        resulting list (possibly empty) is cached on the instance.
        """
        def get(self):
            ns_cache = self.cache.setdefault(namespace, {})
            # Test for key presence, not truthiness, so that empty results
            # are served from the cache as well.
            if name in ns_cache:
                return ns_cache[name]
            retval = []
            for element in self.getElement("", namespace, name):
                for bag in element.getElementsByTagNameNS(RDF_NAMESPACE, "Bag"):
                    for item in bag.getElementsByTagNameNS(RDF_NAMESPACE, "li"):
                        retval.append(converter(self._getText(item)))
            ns_cache[name] = retval
            return retval
        return get

    def _getter_seq(namespace, name, converter):
        """Build a getter returning a list of the rdf:Seq items of a property.

        Each rdf:li text inside any rdf:Seq of the property is passed through
        *converter*.  A property without an rdf:Seq is treated as a single
        value taken from its own text.  The resulting list (possibly empty)
        is cached on the instance.
        """
        def get(self):
            ns_cache = self.cache.setdefault(namespace, {})
            if name in ns_cache:
                return ns_cache[name]
            retval = []
            for element in self.getElement("", namespace, name):
                seqs = element.getElementsByTagNameNS(RDF_NAMESPACE, "Seq")
                if seqs:
                    for seq in seqs:
                        for item in seq.getElementsByTagNameNS(RDF_NAMESPACE, "li"):
                            retval.append(converter(self._getText(item)))
                else:
                    retval.append(converter(self._getText(element)))
            ns_cache[name] = retval
            return retval
        return get

    def _getter_langalt(namespace, name, converter):
        """Build a getter returning a {language: value} dict for a property.

        Each rdf:li inside any rdf:Alt of the property is stored under the
        value of its xml:lang attribute.  A property without an rdf:Alt is
        stored under the key "x-default" using its own text.  The resulting
        dict (possibly empty) is cached on the instance.
        """
        def get(self):
            ns_cache = self.cache.setdefault(namespace, {})
            if name in ns_cache:
                return ns_cache[name]
            retval = {}
            for element in self.getElement("", namespace, name):
                alts = element.getElementsByTagNameNS(RDF_NAMESPACE, "Alt")
                if alts:
                    for alt in alts:
                        for item in alt.getElementsByTagNameNS(RDF_NAMESPACE, "li"):
                            value = converter(self._getText(item))
                            retval[item.getAttribute("xml:lang")] = value
                else:
                    retval["x-default"] = converter(self._getText(element))
            ns_cache[name] = retval
            return retval
        return get

    def _getter_single(namespace, name, converter):
        """Build a getter returning the single value of a property, or None.

        Only the first node yielded by ``getElement`` is used.  Its value is
        the attribute value for an attribute node, otherwise the element's
        text.  The converted value (or None when the property is absent) is
        cached on the instance.
        """
        def get(self):
            ns_cache = self.cache.setdefault(namespace, {})
            if name in ns_cache:
                return ns_cache[name]
            value = None
            for element in self.getElement("", namespace, name):
                if element.nodeType == element.ATTRIBUTE_NODE:
                    value = element.nodeValue
                else:
                    value = self._getText(element)
                break
            if value is not None:
                value = converter(value)
            ns_cache[name] = value
            return value
        return get

    ##
    # Contributors to the resource (other than the authors).  An unsorted
    # array of names.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    dc_contributor = property(_getter_bag(DC_NAMESPACE, "contributor", _converter_string))

    ##
    # Text describing the extent or scope of the resource.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    dc_coverage = property(_getter_single(DC_NAMESPACE, "coverage", _converter_string))

    ##
    # A sorted array of names of the authors of the resource, listed in order
    # of precedence.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    dc_creator = property(_getter_seq(DC_NAMESPACE, "creator", _converter_string))

    ##
    # A sorted array of dates (datetime.datetime instances) of significance to
    # the resource.  The dates and times are in UTC.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    dc_date = property(_getter_seq(DC_NAMESPACE, "date", _converter_date))

    ##
    # A language-keyed dictionary of textual descriptions of the content of the
    # resource.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    dc_description = property(_getter_langalt(DC_NAMESPACE, "description", _converter_string))

    ##
    # The mime-type of the resource.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    dc_format = property(_getter_single(DC_NAMESPACE, "format", _converter_string))

    ##
    # Unique identifier of the resource.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    dc_identifier = property(_getter_single(DC_NAMESPACE, "identifier", _converter_string))

    ##
    # An unordered array specifying the languages used in the resource.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    dc_language = property(_getter_bag(DC_NAMESPACE, "language", _converter_string))

    ##
    # An unordered array of publisher names.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    dc_publisher = property(_getter_bag(DC_NAMESPACE, "publisher", _converter_string))

    ##
    # An unordered array of text descriptions of relationships to other
    # documents.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    dc_relation = property(_getter_bag(DC_NAMESPACE, "relation", _converter_string))

    ##
    # A language-keyed dictionary of textual descriptions of the rights the
    # user has to this resource.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    dc_rights = property(_getter_langalt(DC_NAMESPACE, "rights", _converter_string))

    ##
    # Unique identifier of the work from which this resource was derived.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    dc_source = property(_getter_single(DC_NAMESPACE, "source", _converter_string))

    ##
    # An unordered array of descriptive phrases or keywords that specify the
    # topic of the content of the resource.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    dc_subject = property(_getter_bag(DC_NAMESPACE, "subject", _converter_string))

    ##
    # A language-keyed dictionary of the title of the resource.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    dc_title = property(_getter_langalt(DC_NAMESPACE, "title", _converter_string))

    ##
    # An unordered array of textual descriptions of the document type.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    dc_type = property(_getter_bag(DC_NAMESPACE, "type", _converter_string))

    ##
    # An unformatted text string representing document keywords.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    pdf_keywords = property(_getter_single(PDF_NAMESPACE, "Keywords", _converter_string))

    ##
    # The PDF file version, for example 1.0, 1.3.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    pdf_pdfversion = property(_getter_single(PDF_NAMESPACE, "PDFVersion", _converter_string))

    ##
    # The name of the tool that created the PDF document.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    pdf_producer = property(_getter_single(PDF_NAMESPACE, "Producer", _converter_string))

    ##
    # The date and time the resource was originally created.  The date and
    # time are returned as a UTC datetime.datetime object.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    xmp_createDate = property(_getter_single(XMP_NAMESPACE, "CreateDate", _converter_date))

    ##
    # The date and time the resource was last modified.  The date and time
    # are returned as a UTC datetime.datetime object.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    xmp_modifyDate = property(_getter_single(XMP_NAMESPACE, "ModifyDate", _converter_date))

    ##
    # The date and time that any metadata for this resource was last
    # changed.  The date and time are returned as a UTC datetime.datetime
    # object.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    xmp_metadataDate = property(_getter_single(XMP_NAMESPACE, "MetadataDate", _converter_date))

    ##
    # The name of the first known tool used to create the resource.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    xmp_creatorTool = property(_getter_single(XMP_NAMESPACE, "CreatorTool", _converter_string))

    ##
    # The common identifier for all versions and renditions of this resource.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    xmpmm_documentId = property(_getter_single(XMPMM_NAMESPACE, "DocumentID", _converter_string))

    ##
    # An identifier for a specific incarnation of a document, updated each
    # time a file is saved.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    xmpmm_instanceId = property(_getter_single(XMPMM_NAMESPACE, "InstanceID", _converter_string))

    def custom_properties(self):
        """Return a dict of the custom (pdfx namespace) metadata properties.

        Keys are the local names of the pdfx attributes/elements with every
        well-formed escape (U+2182 followed by four hex digits) replaced by
        the character it encodes.  Malformed escapes are left untouched.
        Values are attribute values or element text.  The dict is built on
        first access and cached on the instance.
        """
        if not hasattr(self, "_custom_properties"):
            props = {}
            for node in self.getNodesInNamespace("", PDFX_NAMESPACE):
                # see documentation about PDFX_NAMESPACE earlier in file.
                # A single left-to-right pass guarantees that a decoded
                # character is never re-interpreted as a new escape.
                key = re.sub(
                    u"\u2182([0-9A-Fa-f]{4})",
                    lambda m: chr(int(m.group(1), 16)),
                    node.localName,
                )
                if node.nodeType == node.ATTRIBUTE_NODE:
                    value = node.nodeValue
                else:
                    value = self._getText(node)
                props[key] = value
            # Publish only the completed dict so that an error part-way
            # through never leaves a partial result cached.
            self._custom_properties = props
        return self._custom_properties

    ##
    # Retrieves custom metadata properties defined in the undocumented pdfx
    # metadata schema.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    # @return Returns a dictionary of key/value items for custom metadata
    # properties.
    custom_properties = property(custom_properties)
354
355
356