import re
import datetime
import decimal
from generic import PdfObject
from xml.dom import getDOMImplementation
from xml.dom.minidom import parseString

# XML namespace URIs of the metadata schemas read by this module.
RDF_NAMESPACE = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
DC_NAMESPACE = "http://purl.org/dc/elements/1.1/"
XMP_NAMESPACE = "http://ns.adobe.com/xap/1.0/"
PDF_NAMESPACE = "http://ns.adobe.com/pdf/1.3/"
XMPMM_NAMESPACE = "http://ns.adobe.com/xap/1.0/mm/"

# What is the PDFX namespace, you might ask?  I might ask that too.  It's
# a completely undocumented namespace used to place "custom metadata"
# properties, which are arbitrary metadata properties with no semantic or
# documented meaning.  Elements in the namespace are key/value-style storage,
# where the element name is the key and the content is the value.  The keys
# are transformed into valid XML identifiers by substituting an invalid
# identifier character with \u2182 followed by the unicode hex ID of the
# original character.  A key like "my car" is therefore "my\u21820020car".
#
# \u2182, in case you're wondering, is the unicode character
# \u{ROMAN NUMERAL TEN THOUSAND}, a straightforward and obvious choice for
# escaping characters.
#
# Intentional users of the pdfx namespace should be shot on sight.  A
# custom data schema and sensical XML elements could be used instead, as is
# suggested by Adobe's own documentation on XMP (under "Extensibility of
# Schemas").
#
# Information presented here on the /pdfx/ schema is a result of limited
# reverse engineering, and does not constitute a full specification.
PDFX_NAMESPACE = "http://ns.adobe.com/pdfx/1.3/"

# Matches the ISO 8601 date profile used by XMP: a mandatory year followed
# by progressively optional month, day and time parts.  When a time part is
# present it must end with a time zone designator ("Z" or "+hh:mm"/"-hh:mm").
# The pattern is not anchored at the end, so trailing text is ignored.
# The decimal point of the fractional seconds is escaped so that only a
# literal "." is accepted as the separator.
iso8601 = re.compile(r"""
        (?P<year>[0-9]{4})
        (-
            (?P<month>[0-9]{2})
            (-
                (?P<day>[0-9]+)
                (T
                    (?P<hour>[0-9]{2}):
                    (?P<minute>[0-9]{2})
                    (:(?P<second>[0-9]{2}(\.[0-9]+)?))?
                    (?P<tzd>Z|[-+][0-9]{2}:[0-9]{2})
                )?
            )?
        )?
        """, re.VERBOSE)

##
# An object that represents Adobe XMP metadata.
class XmpInformation(PdfObject):
    """Read-only accessor for the Adobe XMP metadata packet of a PDF.

    The packet is parsed once with xml.dom.minidom when the object is
    created.  Individual metadata fields are exposed as properties
    (``dc_*``, ``pdf_*``, ``xmp_*``, ``xmpmm_*`` and ``custom_properties``)
    whose values are computed on first access and then cached.
    """

    def __init__(self, stream):
        """Parse the XMP packet returned by ``stream.getData()``.

        The first rdf:RDF element of the packet becomes the root used by
        all lookups; an IndexError is raised if the packet has none.
        """
        self.stream = stream
        docRoot = parseString(self.stream.getData())
        self.rdfRoot = docRoot.getElementsByTagNameNS(RDF_NAMESPACE, "RDF")[0]
        # Maps namespace URI -> {property name -> converted value}.
        self.cache = {}

    def writeToStream(self, stream, encryption_key):
        """Write the underlying (unmodified) stream object to ``stream``."""
        self.stream.writeToStream(stream, encryption_key)

    def getElement(self, aboutUri, namespace, name):
        """Yield every node holding the property ``namespace``/``name``.

        Only rdf:Description elements whose rdf:about attribute equals
        ``aboutUri`` are searched.  For each of them the property is
        yielded first in its attribute form (an attribute node), if
        present, and then in its element form (every descendant element
        with that qualified name).
        """
        for desc in self.rdfRoot.getElementsByTagNameNS(RDF_NAMESPACE, "Description"):
            if desc.getAttributeNS(RDF_NAMESPACE, "about") == aboutUri:
                attr = desc.getAttributeNodeNS(namespace, name)
                if attr is not None:
                    yield attr
                for element in desc.getElementsByTagNameNS(namespace, name):
                    yield element

    def getNodesInNamespace(self, aboutUri, namespace):
        """Yield all attributes and direct children in ``namespace``.

        Only rdf:Description elements whose rdf:about attribute equals
        ``aboutUri`` are searched; attributes are yielded before child
        nodes.
        """
        for desc in self.rdfRoot.getElementsByTagNameNS(RDF_NAMESPACE, "Description"):
            if desc.getAttributeNS(RDF_NAMESPACE, "about") == aboutUri:
                for i in range(desc.attributes.length):
                    attr = desc.attributes.item(i)
                    if attr.namespaceURI == namespace:
                        yield attr
                for child in desc.childNodes:
                    if child.namespaceURI == namespace:
                        yield child

    def _getText(self, element):
        """Return the concatenated data of the direct text children."""
        return "".join(
            child.data
            for child in element.childNodes
            if child.nodeType == child.TEXT_NODE
        )

    def _getValue(self, node):
        """Return the textual value of an attribute node or an element."""
        if node.nodeType == node.ATTRIBUTE_NODE:
            return node.nodeValue
        return self._getText(node)

    def _getListItems(self, node, container):
        """Return the rdf:li elements of the rdf:``container`` arrays in node.

        Returns None when ``node`` holds no such array at all (this is
        always the case for attribute nodes, which cannot have element
        children), so callers can tell "no array" apart from "empty array".
        """
        if node.nodeType == node.ATTRIBUTE_NODE:
            return None
        containers = node.getElementsByTagNameNS(RDF_NAMESPACE, container)
        if not containers:
            return None
        items = []
        for array in containers:
            items.extend(array.getElementsByTagNameNS(RDF_NAMESPACE, "li"))
        return items

    def _cached(self, namespace, name, compute):
        """Return the cached value of a property, computing it on a miss.

        Presence of the key decides a hit, so empty lists, empty
        dictionaries and None are cached as well instead of being
        recomputed on every access.
        """
        ns_cache = self.cache.setdefault(namespace, {})
        if name not in ns_cache:
            ns_cache[name] = compute()
        return ns_cache[name]

    # The functions below are deliberately written without a ``self``
    # parameter: they are called as plain functions while the class body
    # executes, to build the property getters further down.

    def _converter_string(value):
        """Return ``value`` unchanged (converter for plain text)."""
        return value

    def _converter_date(value):
        """Convert an ISO 8601 string into a naive UTC datetime.

        Missing month/day default to 1 and missing time fields to 0; a
        value without a time zone designator is taken to be UTC.
        Fractions of a second finer than a microsecond are truncated.
        Raises ValueError if ``value`` does not start with an ISO 8601
        date, or if a field is out of range for datetime.datetime.
        """
        m = iso8601.match(value)
        if m is None:
            raise ValueError("Invalid ISO 8601 date: %r" % (value,))
        year = int(m.group("year"))
        month = int(m.group("month") or "1")
        day = int(m.group("day") or "1")
        hour = int(m.group("hour") or "0")
        minute = int(m.group("minute") or "0")
        second = decimal.Decimal(m.group("second") or "0")
        whole_seconds = second.to_integral(decimal.ROUND_FLOOR)
        microseconds = (second - whole_seconds) * 1000000
        # datetime requires real integers; Decimal is rejected on Python 3.
        dt = datetime.datetime(year, month, day, hour, minute,
                               int(whole_seconds), int(microseconds))
        tzd = m.group("tzd") or "Z"
        if tzd != "Z":
            # Take the sign from the designator itself rather than from the
            # parsed hour, so that offsets such as "+00:30" keep their sign.
            sign = -1 if tzd[0] == "-" else 1
            tzd_hours, tzd_minutes = [int(x) for x in tzd[1:].split(":")]
            offset = datetime.timedelta(hours=tzd_hours, minutes=tzd_minutes)
            # UTC = local time - offset.
            dt = dt - sign * offset
        return dt
    _test_converter_date = staticmethod(_converter_date)

    def _getter_bag(namespace, name, converter):
        """Build a getter returning the items of an rdf:Bag as a list.

        Nodes without an rdf:Bag contribute nothing to the list.
        """
        def get(self):
            def compute():
                values = []
                for element in self.getElement("", namespace, name):
                    for item in self._getListItems(element, "Bag") or ():
                        values.append(converter(self._getText(item)))
                return values
            return self._cached(namespace, name, compute)
        return get

    def _getter_seq(namespace, name, converter):
        """Build a getter returning the items of an rdf:Seq as a list.

        A node without an rdf:Seq contributes its own text (or attribute
        value) as a single item.
        """
        def get(self):
            def compute():
                values = []
                for element in self.getElement("", namespace, name):
                    items = self._getListItems(element, "Seq")
                    if items is None:
                        values.append(converter(self._getValue(element)))
                    else:
                        for item in items:
                            values.append(converter(self._getText(item)))
                return values
            return self._cached(namespace, name, compute)
        return get

    def _getter_langalt(namespace, name, converter):
        """Build a getter returning an rdf:Alt as a language-keyed dict.

        Keys are the xml:lang attribute of each item.  A node without an
        rdf:Alt contributes its own text (or attribute value) under the
        key "x-default".
        """
        def get(self):
            def compute():
                values = {}
                for element in self.getElement("", namespace, name):
                    items = self._getListItems(element, "Alt")
                    if items is None:
                        values["x-default"] = converter(self._getValue(element))
                    else:
                        for item in items:
                            text = converter(self._getText(item))
                            values[item.getAttribute("xml:lang")] = text
                return values
            return self._cached(namespace, name, compute)
        return get

    def _getter_single(namespace, name, converter):
        """Build a getter returning the first occurrence of a simple value.

        The getter returns None when the property is absent.
        """
        def get(self):
            def compute():
                # Only the first matching node is considered.
                for node in self.getElement("", namespace, name):
                    return converter(self._getValue(node))
                return None
            return self._cached(namespace, name, compute)
        return get

    ##
    # Contributors to the resource (other than the authors).  An unsorted
    # array of names.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    dc_contributor = property(_getter_bag(DC_NAMESPACE, "contributor", _converter_string))

    ##
    # Text describing the extent or scope of the resource.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    dc_coverage = property(_getter_single(DC_NAMESPACE, "coverage", _converter_string))

    ##
    # A sorted array of names of the authors of the resource, listed in order
    # of precedence.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    dc_creator = property(_getter_seq(DC_NAMESPACE, "creator", _converter_string))

    ##
    # A sorted array of dates (datetime.datetime instances) of significance to
    # the resource.  The dates and times are in UTC.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    dc_date = property(_getter_seq(DC_NAMESPACE, "date", _converter_date))

    ##
    # A language-keyed dictionary of textual descriptions of the content of the
    # resource.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    dc_description = property(_getter_langalt(DC_NAMESPACE, "description", _converter_string))

    ##
    # The mime-type of the resource.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    dc_format = property(_getter_single(DC_NAMESPACE, "format", _converter_string))

    ##
    # Unique identifier of the resource.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    dc_identifier = property(_getter_single(DC_NAMESPACE, "identifier", _converter_string))

    ##
    # An unordered array specifying the languages used in the resource.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    dc_language = property(_getter_bag(DC_NAMESPACE, "language", _converter_string))

    ##
    # An unordered array of publisher names.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    dc_publisher = property(_getter_bag(DC_NAMESPACE, "publisher", _converter_string))

    ##
    # An unordered array of text descriptions of relationships to other
    # documents.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    dc_relation = property(_getter_bag(DC_NAMESPACE, "relation", _converter_string))

    ##
    # A language-keyed dictionary of textual descriptions of the rights the
    # user has to this resource.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    dc_rights = property(_getter_langalt(DC_NAMESPACE, "rights", _converter_string))

    ##
    # Unique identifier of the work from which this resource was derived.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    dc_source = property(_getter_single(DC_NAMESPACE, "source", _converter_string))

    ##
    # An unordered array of descriptive phrases or keywords that specify the
    # topic of the content of the resource.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    dc_subject = property(_getter_bag(DC_NAMESPACE, "subject", _converter_string))

    ##
    # A language-keyed dictionary of the title of the resource.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    dc_title = property(_getter_langalt(DC_NAMESPACE, "title", _converter_string))

    ##
    # An unordered array of textual descriptions of the document type.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    dc_type = property(_getter_bag(DC_NAMESPACE, "type", _converter_string))

    ##
    # An unformatted text string representing document keywords.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    pdf_keywords = property(_getter_single(PDF_NAMESPACE, "Keywords", _converter_string))

    ##
    # The PDF file version, for example 1.0, 1.3.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    pdf_pdfversion = property(_getter_single(PDF_NAMESPACE, "PDFVersion", _converter_string))

    ##
    # The name of the tool that created the PDF document.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    pdf_producer = property(_getter_single(PDF_NAMESPACE, "Producer", _converter_string))

    ##
    # The date and time the resource was originally created.  The date and
    # time are returned as a UTC datetime.datetime object.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    xmp_createDate = property(_getter_single(XMP_NAMESPACE, "CreateDate", _converter_date))

    ##
    # The date and time the resource was last modified.  The date and time
    # are returned as a UTC datetime.datetime object.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    xmp_modifyDate = property(_getter_single(XMP_NAMESPACE, "ModifyDate", _converter_date))

    ##
    # The date and time that any metadata for this resource was last
    # changed.  The date and time are returned as a UTC datetime.datetime
    # object.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    xmp_metadataDate = property(_getter_single(XMP_NAMESPACE, "MetadataDate", _converter_date))

    ##
    # The name of the first known tool used to create the resource.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    xmp_creatorTool = property(_getter_single(XMP_NAMESPACE, "CreatorTool", _converter_string))

    ##
    # The common identifier for all versions and renditions of this resource.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    xmpmm_documentId = property(_getter_single(XMPMM_NAMESPACE, "DocumentID", _converter_string))

    ##
    # An identifier for a specific incarnation of a document, updated each
    # time a file is saved.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    xmpmm_instanceId = property(_getter_single(XMPMM_NAMESPACE, "InstanceID", _converter_string))

    def custom_properties(self):
        """Collect the pdfx custom properties into a dictionary (cached)."""
        if not hasattr(self, "_custom_properties"):
            def decode(match):
                return chr(int(match.group(1), 16))

            properties = {}
            for node in self.getNodesInNamespace("", PDFX_NAMESPACE):
                # Keys escape characters that are invalid in XML names as
                # U+2182 followed by four hex digits of the code point.
                # A single substitution pass keeps already decoded text
                # from being decoded again and leaves malformed escapes
                # untouched.
                key = re.sub(u"\u2182([0-9A-Fa-f]{4})", decode, node.localName)
                properties[key] = self._getValue(node)
            # Stored only once complete, so a failure part-way through
            # never leaves a partial dictionary cached.
            self._custom_properties = properties
        return self._custom_properties

    ##
    # Retrieves custom metadata properties defined in the undocumented pdfx
    # metadata schema.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    # @return Returns a dictionary of key/value items for custom metadata
    # properties.
    custom_properties = property(custom_properties)