1#!/usr/bin/python 2# Authors: James Migletz and Simson Garfinkel 3# Description: Program extracts metadata from 4# Microsoft Office 2007 packages. Extraction works on 5# .docx, .xlsx, and .pptx file types 6# To Do: 7# Incorporate recursive call to handled embedded docx files 8 9 10# 11# Filename: docx_extractor.py 12# Date: 27 Apr 2008 13# 14# See: http://www.diveintopython.org/xml_processing/index.html 15# http://python.active-venture.com/lib/dom-example.html 16 17import xml.dom.minidom 18import sys 19from subprocess import * 20debug = False 21 22# Only for future reference 23# This prints all the methods that company[0].firstChild can respond to 24 # It's another XML minidom object... 25 #print dir(company[0].firstChild) 26 #print dir(company[0]) 27 28def process_xml(xmlString): 29 30 if(len(xmlString)==0): return 31 # print "xml=",xmlString 32 xml_dom = xml.dom.minidom.parseString(xmlString) 33 if debug: 34 u = xml_dom.toprettyxml(" ") 35 print u.encode('ascii','replace') 36 37 # try to find a paragraph revision ID 38 # links settings.xml and styles.xml 39 try: 40 rsid = xml_dom.getElementsByTagName("w:p")[0].getAttribute('w:rsidR') 41 42 if rsid and not(rsid in revisionIdArray): 43 revisionIdArray.append(rsid) 44 drillDownOutput("Paragraph-Revision-ID", len(revisionIdArray), rsid) 45 46 except IndexError: 47 pass 48 49 # try to find a default paragraph revision ID 50 # links to settings.xml and styles.xml 51 try: 52 rsidDef = xml_dom.getElementsByTagName("w:p")[0].getAttribute('w:rsidRDefault') 53 54 if rsidDef and not(rsidDef in idDefaultArray): 55 idDefaultArray.append(rsidDef) 56 drillDownOutput("Paragraph-Revision-ID-Default", len(revisionIdArray), rsidDef) 57 58 except IndexError: 59 pass 60 61 #try to find property text 62 #if empty -- ignore 63 try: 64 propertyText = xml_dom.getElementsByTagName("w:r")[0].getAttribute('w:t') 65 if not(propertyText in propertyTextArray) and propertyText: 66 propertyTextArray.append(propertyText) 67 drillDownOutput("Property-Text", len(propertyTextArray), propertyText) 68 69 #print "Property-Text",propertyText 70 except IndexError: 71 pass 72 73 # collect names associated with images 74 try: 75 imageFile = xml_dom.getElementsByTagName("pic:cNvPr") 76 drillDown("Image", imageFile, 'name') 77 except IndexError: 78 pass 79 80 # get GUIDs from files for customXml info 81 try: 82 guid = xml_dom.getElementsByTagName("w:guid") 83 drillDown("GUID", guid, 'w:val') 84 except IndexError: 85 pass 86 87 # get aliases for content control's structured data tag 88 try: 89 sdt_alias = xml_dom.getElementsByTagName("w:alias") 90 drillDown("Content-Control-Alias", sdt_alias, 'w:val') 91 except IndexError: 92 pass 93 94 # get tag names for content control's structured data tag 95 try: 96 sdt_tags = xml_dom.getElementsByTagName("w:tag") 97 drillDown("Content-Control", sdt_tags, 'w:val') 98 except IndexError: 99 pass 100 101 # get id nums for content control's structured data tag 102 try: 103 sdt_ids = xml_dom.getElementsByTagName("w:id") 104 drillDown("Content-Control-Id", sdt_ids, 'w:val') 105 except IndexError: 106 pass 107 108 # get data store id nums from customXml part 109 try: 110 dataStore_ids = xml_dom.getElementsByTagName("ds:datastoreItem") 111 drillDown("Data-Store-Item-Id", dataStore_ids, 'ds:itemID') 112 except IndexError: 113 pass 114 115 # get relationship information (files in archive) from document.xml.rels 116 try: 117 relTarget = xml_dom.getElementsByTagName("Relationship") 118 drillDown("Archive-File", relTarget, 'Target') 119 except IndexError: 120 pass 121 122 # Attempt to retrieve "traditional" metadata 123 # Some metadata is tied specifically to Word or PowerPoint 124 createdDate = xml_dom.getElementsByTagName("dcterms:created") 125 collectMetadata(createdDate, "Created: ") 126 127 modifiedDate = xml_dom.getElementsByTagName("dcterms:modified") 128 collectMetadata(modifiedDate, "Last-Modified: ") 129 130 creator = xml_dom.getElementsByTagName("dc:creator") 131 collectMetadata(creator, "Creator: ") 132 133 title = xml_dom.getElementsByTagName("dc:title") 134 collectMetadata(title, "Title: ") 135 136 subject = xml_dom.getElementsByTagName("dc:subject") 137 collectMetadata(subject, "Subject: ") 138 139 description = xml_dom.getElementsByTagName("dc:description") 140 collectMetadata(description, "Description: ") 141 142 keywords = xml_dom.getElementsByTagName("cp:keywords") 143 collectMetadata(keywords, "Keywords: ") 144 145 revisionNum = xml_dom.getElementsByTagName("cp:revision") 146 collectMetadata(revisionNum, "Revision: ") 147 148 lastMod = xml_dom.getElementsByTagName("cp:lastModifiedBy") 149 collectMetadata(lastMod, "LastSavedBy: ") 150 151 application = xml_dom.getElementsByTagName("Application") 152 collectMetadata(application, "Generator: ") 153 154 company = xml_dom.getElementsByTagName("Company") 155 collectMetadata(company, "Company: ") 156 157 template = xml_dom.getElementsByTagName("Template") 158 collectMetadata(template, "Template: ") 159 160 pages = xml_dom.getElementsByTagName("Pages") 161 collectMetadata(pages, "Number-of-Pages: ") 162 163 lines = xml_dom.getElementsByTagName("Lines") 164 collectMetadata(lines, "Number-of-Lines: ") 165 166 paragraphs = xml_dom.getElementsByTagName("Paragraphs") 167 collectMetadata(paragraphs, "Number-of-Paragraphs: ") 168 169 words = xml_dom.getElementsByTagName("Words") 170 collectMetadata(words, "Number-of-Words: ") 171 172 characters = xml_dom.getElementsByTagName("Characters") 173 collectMetadata(characters, "Number-of-Characters: ") 174 175 slides = xml_dom.getElementsByTagName("Slides") 176 collectMetadata(slides, "Number-of-Slides: ") 177 178 hiddenSlides = xml_dom.getElementsByTagName("HiddenSlides") 179 collectMetadata(hiddenSlides, "Number-of-Hidden-Slides: ") 180 181 notesPages = xml_dom.getElementsByTagName("Notes") 182 collectMetadata(notesPages, "Number-of-Notes: ") 183 184 mediaClips = xml_dom.getElementsByTagName("MMClips") 185 collectMetadata(mediaClips, "Number-of-'Multi-Media'-Clips: ") 186 187 presFormat = xml_dom.getElementsByTagName("PresentationFormat") 188 collectMetadata(presFormat, "Presentation-Format: ") 189 190 191# method drills into xml when there is more than one element 192# associated with a tag name within a file 193# Secondary check is completed to ignore duplicate entries 194# and to insure label is unique 195# 196# Parameters 197# label - to be printed with value of attribute 198# tag_array - array of minidom objects 199# tag_name - value to be obtained 200def drillDown(label, tag_array, tag_name): 201 total_count = range(0, tag_array.length) 202 203 for x in total_count: 204 val = tag_array[x].getAttribute(tag_name) 205 if val: 206 207 if label.startswith("Archive-File"): 208 if not(val in targetArray): 209 targetArray.append(val) 210 drillDownOutput(label, len(targetArray), val) 211 212 elif label.endswith("Content-Control"): 213 if not(val in sdtTagArray): 214 sdtTagArray.append(val) 215 drillDownOutput(label, len(sdtTagArray), val) 216 217 elif label.endswith("Content-Control-Id"): 218 if not(val in sdtIdArray): 219 sdtIdArray.append(val) 220 drillDownOutput(label, len(sdtIdArray), val) 221 222 elif label.endswith("Content-Control-Alias"): 223 if not(val in sdtAliasArray): 224 sdtAliasArray.append(val) 225 drillDownOutput(label, len(sdtAliasArray), val) 226 227 elif label.endswith("GUID"): 228 if not(val in GUID_Array): 229 GUID_Array.append(val) 230 drillDownOutput(label, len(GUID_Array), val) 231 232 elif label.endswith("Data-Store-Item-Id"): 233 if not(val in dataStoreArray): 234 dataStoreArray.append(val) 235 drillDownOutput(label, len(dataStoreArray), val) 236 237 elif label.endswith("Image"): 238 if not(val in imageArray): 239 imageArray.append(val) 240 drillDownOutput(label, len(imageArray), val) 241 else: 242 print label + ":",val 243 244 245# method prints the output in label : value format (DGI) 246# Parameters: 247# label - to be printed with text of object 248# count - number of times label has appeared 249# with unique values 250# value - value to be printed with label 251def drillDownOutput(label, count, value): 252 print label+ `count` + ":",value 253 254 255# method prints metadata associated with an array 256# of minidom objects, and prints label and value 257# if one exists 258# Parameters: 259# tagArray - array of minidom objects 260# label - to be printed with text of object 261def collectMetadata(tagArray, label): 262 if tagArray and tagArray[0].hasChildNodes(): 263 text = tagArray[0].firstChild.wholeText.strip().replace("\r"," ").replace("\n"," ") 264 print label,text 265 266def process(fn): 267 #define and initialize counters 268 import zipfile 269 global targetCounter 270 targetCounter = 0 271 272 if not zipfile.is_zipfile(fn): 273 return 274 275 z = zipfile.ZipFile(fn,mode="r") 276 for f in z.namelist(): 277 if f.endswith(".xml") or f.endswith(".rels"): 278 process_xml(z.open(f).read()) 279 280 281 282 283#define and initialize arrays/lists for tags 284targetArray = [] 285idDefaultArray = [] 286revisionIdArray = [] 287sdtTagArray = [] 288sdtIdArray = [] 289sdtAliasArray = [] 290imageArray = [] 291GUID_Array = [] 292dataStoreArray = [] 293propertyTextArray = [] 294imageArray = [] 295 296#start the program here 297if (len(sys.argv) < 2): 298 print "Usage: docx_extractor filename.***x" 299 sys.exit() 300else: 301 if(__name__=="__main__"): 302 process(sys.argv[1]) 303 304