import re
import datetime
import decimal
from .generic import PdfObject
from xml.dom import getDOMImplementation
from xml.dom.minidom import parseString
from .utils import u_

RDF_NAMESPACE = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
DC_NAMESPACE = "http://purl.org/dc/elements/1.1/"
XMP_NAMESPACE = "http://ns.adobe.com/xap/1.0/"
PDF_NAMESPACE = "http://ns.adobe.com/pdf/1.3/"
XMPMM_NAMESPACE = "http://ns.adobe.com/xap/1.0/mm/"

# What is the PDFX namespace, you might ask?  I might ask that too.  It's
# a completely undocumented namespace used to place "custom metadata"
# properties, which are arbitrary metadata properties with no semantic or
# documented meaning.  Elements in the namespace are key/value-style storage,
# where the element name is the key and the content is the value.  The keys
# are transformed into valid XML identifiers by substituting an invalid
# identifier character with \u2182 followed by the unicode hex ID of the
# original character.  A key like "my car" is therefore "my\u21820020car".
#
# \u2182, in case you're wondering, is the unicode character
# \u{ROMAN NUMERAL TEN THOUSAND}, a straightforward and obvious choice for
# escaping characters.
#
# Intentional use of the pdfx namespace is strongly discouraged.  A
# custom data schema and sensible XML elements could be used instead, as is
# suggested by Adobe's own documentation on XMP (under "Extensibility of
# Schemas").
#
# Information presented here on the /pdfx/ schema is a result of limited
# reverse engineering, and does not constitute a full specification.
PDFX_NAMESPACE = "http://ns.adobe.com/pdfx/1.3/"

# Matches the leading ISO 8601 portion of a date string.  Everything after
# the year is optional; when a time is present a timezone designator must
# follow it.  The pattern is only anchored at the start (it is used with
# ``match``), so trailing text is ignored.
iso8601 = re.compile(r"""
        (?P<year>[0-9]{4})
        (-
            (?P<month>[0-9]{2})
            (-
                (?P<day>[0-9]+)
                (T
                    (?P<hour>[0-9]{2}):
                    (?P<minute>[0-9]{2})
                    (:(?P<second>[0-9]{2}(\.[0-9]+)?))?
                    (?P<tzd>Z|[-+][0-9]{2}:[0-9]{2})
                )?
            )?
        )?
        """, re.VERBOSE)


def _rdf_list_items(node, container):
    """
    Collect the ``rdf:li`` elements found inside every ``rdf:<container>``
    (``Bag``, ``Seq`` or ``Alt``) element below *node*.

    Returns ``None`` when *node* is an attribute node or holds no such
    container, so that callers can tell "no container present" apart from
    "container present but empty".
    """
    # Attribute nodes have no element children and no getElementsByTagNameNS.
    if node.nodeType == node.ATTRIBUTE_NODE:
        return None
    containers = node.getElementsByTagNameNS(RDF_NAMESPACE, container)
    if not containers:
        return None
    items = []
    for found in containers:
        items.extend(found.getElementsByTagNameNS(RDF_NAMESPACE, "li"))
    return items


class XmpInformation(PdfObject):
    """
    An object that represents Adobe XMP metadata.
    Usually accessed by :meth:`getXmpMetadata()<PyPDF2.PdfFileReader.getXmpMetadata>`
    """

    def __init__(self, stream):
        """
        Parse the XML held by *stream* and locate its ``rdf:RDF`` root.

        :param stream: object providing ``getData()`` (the XMP packet) and
            ``writeToStream()``.
        :raises IndexError: if the document contains no ``rdf:RDF`` element.
        """
        self.stream = stream
        docRoot = parseString(self.stream.getData())
        self.rdfRoot = docRoot.getElementsByTagNameNS(RDF_NAMESPACE, "RDF")[0]
        # Parsed property values, keyed by namespace and then property name.
        self.cache = {}

    def writeToStream(self, stream, encryption_key):
        """Write the metadata by delegating to the wrapped stream object."""
        self.stream.writeToStream(stream, encryption_key)

    def getElement(self, aboutUri, namespace, name):
        """
        Yield every node carrying the property ``namespace:name`` on the
        ``rdf:Description`` elements whose ``rdf:about`` equals *aboutUri*.

        For each matching description the attribute form (if present) is
        yielded first, followed by all descendant elements with that name.
        """
        for desc in self.rdfRoot.getElementsByTagNameNS(RDF_NAMESPACE, "Description"):
            if desc.getAttributeNS(RDF_NAMESPACE, "about") != aboutUri:
                continue
            attr = desc.getAttributeNodeNS(namespace, name)
            if attr is not None:
                yield attr
            for element in desc.getElementsByTagNameNS(namespace, name):
                yield element

    def getNodesInNamespace(self, aboutUri, namespace):
        """
        Yield all attributes and direct child nodes in *namespace* of the
        ``rdf:Description`` elements whose ``rdf:about`` equals *aboutUri*.
        """
        for desc in self.rdfRoot.getElementsByTagNameNS(RDF_NAMESPACE, "Description"):
            if desc.getAttributeNS(RDF_NAMESPACE, "about") != aboutUri:
                continue
            for i in range(desc.attributes.length):
                attr = desc.attributes.item(i)
                if attr.namespaceURI == namespace:
                    yield attr
            for child in desc.childNodes:
                if child.namespaceURI == namespace:
                    yield child

    def _getText(self, element):
        """Return the concatenated data of *element*'s direct text children."""
        return "".join(
            child.data
            for child in element.childNodes
            if child.nodeType == child.TEXT_NODE
        )

    def _getValue(self, node):
        """
        Return the textual value of a property node: ``nodeValue`` for an
        attribute node, the direct text content for any other node.
        """
        if node.nodeType == node.ATTRIBUTE_NODE:
            return node.nodeValue
        return self._getText(node)

    def _converter_string(value):
        """Identity converter for plain text properties."""
        return value

    def _converter_date(value):
        """
        Convert an ISO 8601 date string into a naive ``datetime.datetime``
        expressed in UTC.

        Missing month/day default to 1 and missing time fields to 0.  A
        timezone offset, when present, is removed so the result is UTC.

        :raises ValueError: if *value* does not start with a 4-digit year.
        """
        m = iso8601.match(value)
        if m is None:
            raise ValueError("Invalid ISO 8601 date: %r" % (value,))
        year = int(m.group("year"))
        month = int(m.group("month") or "1")
        day = int(m.group("day") or "1")
        hour = int(m.group("hour") or "0")
        minute = int(m.group("minute") or "0")
        # Decimal keeps the fractional part exact while it is split off.
        second = decimal.Decimal(m.group("second") or "0")
        whole_seconds = second.to_integral(decimal.ROUND_FLOOR)
        microseconds = (second - whole_seconds) * 1000000
        tzd = m.group("tzd") or "Z"
        # datetime() requires real integers, not Decimal instances.
        dt = datetime.datetime(year, month, day, hour, minute,
                               int(whole_seconds), int(microseconds))
        if tzd != "Z":
            tzd_hours, tzd_minutes = [int(x) for x in tzd[1:].split(":")]
            offset = datetime.timedelta(hours=tzd_hours, minutes=tzd_minutes)
            # local time = UTC + offset, hence UTC = local time - offset.
            # The sign is taken from the designator itself so that offsets
            # with a zero hour part (e.g. "+00:30") are handled correctly.
            if tzd[0] == "+":
                dt = dt - offset
            else:
                dt = dt + offset
        return dt
    _test_converter_date = staticmethod(_converter_date)

    def _getter_bag(namespace, name, converter):
        """
        Build a property getter returning a list with the converted text of
        every ``rdf:li`` in the ``rdf:Bag`` containers of ``namespace:name``.
        Nodes without a Bag contribute nothing.  Results are cached.
        """
        def get(self):
            ns_cache = self.cache.setdefault(namespace, {})
            # Membership test so that empty results are cached as well.
            if name in ns_cache:
                return ns_cache[name]
            retval = []
            for element in self.getElement("", namespace, name):
                for item in _rdf_list_items(element, "Bag") or ():
                    retval.append(converter(self._getText(item)))
            ns_cache[name] = retval
            return retval
        return get

    def _getter_seq(namespace, name, converter):
        """
        Build a property getter returning a list with the converted text of
        every ``rdf:li`` in the ``rdf:Seq`` containers of ``namespace:name``.
        A node without a Seq contributes its own converted value instead.
        Results are cached.
        """
        def get(self):
            ns_cache = self.cache.setdefault(namespace, {})
            if name in ns_cache:
                return ns_cache[name]
            retval = []
            for element in self.getElement("", namespace, name):
                items = _rdf_list_items(element, "Seq")
                if items is None:
                    retval.append(converter(self._getValue(element)))
                else:
                    for item in items:
                        retval.append(converter(self._getText(item)))
            ns_cache[name] = retval
            return retval
        return get

    def _getter_langalt(namespace, name, converter):
        """
        Build a property getter returning a dict that maps each ``xml:lang``
        value of the ``rdf:li`` items in the ``rdf:Alt`` containers of
        ``namespace:name`` to its converted text.  A node without an Alt is
        stored under the key ``"x-default"``.  Results are cached.
        """
        def get(self):
            ns_cache = self.cache.setdefault(namespace, {})
            if name in ns_cache:
                return ns_cache[name]
            retval = {}
            for element in self.getElement("", namespace, name):
                items = _rdf_list_items(element, "Alt")
                if items is None:
                    retval["x-default"] = converter(self._getValue(element))
                else:
                    for item in items:
                        value = converter(self._getText(item))
                        retval[item.getAttribute("xml:lang")] = value
            ns_cache[name] = retval
            return retval
        return get

    def _getter_single(namespace, name, converter):
        """
        Build a property getter returning the converted value of the first
        node found for ``namespace:name``, or ``None`` when there is none.
        Results are cached.
        """
        def get(self):
            ns_cache = self.cache.setdefault(namespace, {})
            if name in ns_cache:
                return ns_cache[name]
            value = None
            # Only the first node is considered.
            for element in self.getElement("", namespace, name):
                value = self._getValue(element)
                break
            if value is not None:
                value = converter(value)
            ns_cache[name] = value
            return value
        return get

    dc_contributor = property(_getter_bag(DC_NAMESPACE, "contributor", _converter_string))
    """
    Contributors to the resource (other than the authors). An unsorted
    array of names.
    """

    dc_coverage = property(_getter_single(DC_NAMESPACE, "coverage", _converter_string))
    """
    Text describing the extent or scope of the resource.
    """

    dc_creator = property(_getter_seq(DC_NAMESPACE, "creator", _converter_string))
    """
    A sorted array of names of the authors of the resource, listed in order
    of precedence.
    """

    dc_date = property(_getter_seq(DC_NAMESPACE, "date", _converter_date))
    """
    A sorted array of dates (datetime.datetime instances) of significance to
    the resource.  The dates and times are in UTC.
    """

    dc_description = property(_getter_langalt(DC_NAMESPACE, "description", _converter_string))
    """
    A language-keyed dictionary of textual descriptions of the content of the
    resource.
    """

    dc_format = property(_getter_single(DC_NAMESPACE, "format", _converter_string))
    """
    The mime-type of the resource.
    """

    dc_identifier = property(_getter_single(DC_NAMESPACE, "identifier", _converter_string))
    """
    Unique identifier of the resource.
    """

    dc_language = property(_getter_bag(DC_NAMESPACE, "language", _converter_string))
    """
    An unordered array specifying the languages used in the resource.
    """

    dc_publisher = property(_getter_bag(DC_NAMESPACE, "publisher", _converter_string))
    """
    An unordered array of publisher names.
    """

    dc_relation = property(_getter_bag(DC_NAMESPACE, "relation", _converter_string))
    """
    An unordered array of text descriptions of relationships to other
    documents.
    """

    dc_rights = property(_getter_langalt(DC_NAMESPACE, "rights", _converter_string))
    """
    A language-keyed dictionary of textual descriptions of the rights the
    user has to this resource.
    """

    dc_source = property(_getter_single(DC_NAMESPACE, "source", _converter_string))
    """
    Unique identifier of the work from which this resource was derived.
    """

    dc_subject = property(_getter_bag(DC_NAMESPACE, "subject", _converter_string))
    """
    An unordered array of descriptive phrases or keywords that specify the
    topic of the content of the resource.
    """

    dc_title = property(_getter_langalt(DC_NAMESPACE, "title", _converter_string))
    """
    A language-keyed dictionary of the title of the resource.
    """

    dc_type = property(_getter_bag(DC_NAMESPACE, "type", _converter_string))
    """
    An unordered array of textual descriptions of the document type.
    """

    pdf_keywords = property(_getter_single(PDF_NAMESPACE, "Keywords", _converter_string))
    """
    An unformatted text string representing document keywords.
    """

    pdf_pdfversion = property(_getter_single(PDF_NAMESPACE, "PDFVersion", _converter_string))
    """
    The PDF file version, for example 1.0, 1.3.
    """

    pdf_producer = property(_getter_single(PDF_NAMESPACE, "Producer", _converter_string))
    """
    The name of the tool that created the PDF document.
    """

    xmp_createDate = property(_getter_single(XMP_NAMESPACE, "CreateDate", _converter_date))
    """
    The date and time the resource was originally created.  The date and
    time are returned as a UTC datetime.datetime object.
    """

    xmp_modifyDate = property(_getter_single(XMP_NAMESPACE, "ModifyDate", _converter_date))
    """
    The date and time the resource was last modified.  The date and time
    are returned as a UTC datetime.datetime object.
    """

    xmp_metadataDate = property(_getter_single(XMP_NAMESPACE, "MetadataDate", _converter_date))
    """
    The date and time that any metadata for this resource was last
    changed.  The date and time are returned as a UTC datetime.datetime
    object.
    """

    xmp_creatorTool = property(_getter_single(XMP_NAMESPACE, "CreatorTool", _converter_string))
    """
    The name of the first known tool used to create the resource.
    """

    xmpmm_documentId = property(_getter_single(XMPMM_NAMESPACE, "DocumentID", _converter_string))
    """
    The common identifier for all versions and renditions of this resource.
    """

    xmpmm_instanceId = property(_getter_single(XMPMM_NAMESPACE, "InstanceID", _converter_string))
    """
    An identifier for a specific incarnation of a document, updated each
    time a file is saved.
    """

    def custom_properties(self):
        if not hasattr(self, "_custom_properties"):
            marker = u_("\u2182")
            # Build into a local dict so a failure part-way through never
            # leaves a half-filled result cached on the instance.
            properties = {}
            for node in self.getNodesInNamespace("", PDFX_NAMESPACE):
                key = node.localName
                start = 0
                while True:
                    # see documentation about PDFX_NAMESPACE earlier in file
                    idx = key.find(marker, start)
                    if idx == -1:
                        break
                    key = key[:idx] + chr(int(key[idx+1:idx+5], base=16)) + key[idx+5:]
                    # Resume after the decoded character so that a decoded
                    # literal marker is not mistaken for another escape.
                    start = idx + 1
                properties[key] = self._getValue(node)
            self._custom_properties = properties
        return self._custom_properties

    custom_properties = property(custom_properties)
    """
    Retrieves custom metadata properties defined in the undocumented pdfx
    metadata schema.

    :return: a dictionary of key/value items for custom metadata properties.
    :rtype: dict
    """