1#!/usr/bin/python
2# Authors:     James Migletz and Simson Garfinkel
3# Description: Program extracts metadata from
4# Microsoft Office 2007 packages. Extraction works on
5# .docx, .xlsx, and .pptx file types
6# To Do:
# Incorporate recursive call to handle embedded docx files
8
9
10#
11# Filename:    docx_extractor.py
12# Date:        27 Apr 2008
13#
14# See: http://www.diveintopython.org/xml_processing/index.html
15# http://python.active-venture.com/lib/dom-example.html
16
17import xml.dom.minidom
18import sys
19from subprocess import *
20debug = False
21
22# Only for future reference
23# This prints all the methods that company[0].firstChild can respond to
24        # It's another XML minidom object...
25        #print dir(company[0].firstChild)
26        #print dir(company[0])
27
def process_xml(xmlString):
    """Print the metadata found in one XML part of an Office 2007 package.

    The part is parsed with minidom and reported in this order:
      1. revision IDs of the first w:p element,
      2. the 'w:t' attribute of the first w:r element,
      3. attribute values that drillDown numbers and de-duplicates,
      4. document properties printed by collectMetadata.
    Values that were already reported are remembered in the module-level
    lists, so each distinct value is printed only once per run.

    Parse errors raised by minidom are not caught here.

    Parameters:
       xmlString - contents of one XML part; an empty value is ignored
    """
    if len(xmlString) == 0:
        return

    xml_dom = xml.dom.minidom.parseString(xmlString)
    if debug:
        pretty = xml_dom.toprettyxml(" ")
        # Round-trip through ASCII so unprintable characters become '?'
        # while still printing text (not a bytes repr) on any interpreter.
        print(pretty.encode('ascii', 'replace').decode('ascii'))

    # Revision IDs of the first paragraph; these link the part to
    # settings.xml and styles.xml.
    paragraph_nodes = xml_dom.getElementsByTagName("w:p")
    if paragraph_nodes:
        first_paragraph = paragraph_nodes[0]

        rsid = first_paragraph.getAttribute('w:rsidR')
        if rsid and rsid not in revisionIdArray:
            revisionIdArray.append(rsid)
            drillDownOutput("Paragraph-Revision-ID",
                            len(revisionIdArray), rsid)

        rsidDef = first_paragraph.getAttribute('w:rsidRDefault')
        if rsidDef and rsidDef not in idDefaultArray:
            idDefaultArray.append(rsidDef)
            # Number default IDs by their own list; the revision-ID list
            # was used here before, which produced wrong/duplicate labels.
            drillDownOutput("Paragraph-Revision-ID-Default",
                            len(idDefaultArray), rsidDef)

    # Property text of the first run; empty values are ignored.
    # NOTE(review): 'w:t' is read as an attribute of w:r here; confirm this
    # ever matches, since w:t may be a child element rather than an attribute.
    run_nodes = xml_dom.getElementsByTagName("w:r")
    if run_nodes:
        propertyText = run_nodes[0].getAttribute('w:t')
        if propertyText and propertyText not in propertyTextArray:
            propertyTextArray.append(propertyText)
            drillDownOutput("Property-Text",
                            len(propertyTextArray), propertyText)

    # (label, element tag, attribute) triples handed to drillDown, in
    # output order.
    attribute_lookups = (
        ("Image", "pic:cNvPr", 'name'),                  # image names
        ("GUID", "w:guid", 'w:val'),                     # customXml GUIDs
        ("Content-Control-Alias", "w:alias", 'w:val'),   # sdt aliases
        ("Content-Control", "w:tag", 'w:val'),           # sdt tag names
        ("Content-Control-Id", "w:id", 'w:val'),         # sdt id numbers
        ("Data-Store-Item-Id", "ds:datastoreItem", 'ds:itemID'),
        ("Archive-File", "Relationship", 'Target'),      # document.xml.rels
    )
    for label, tag, attribute in attribute_lookups:
        drillDown(label, xml_dom.getElementsByTagName(tag), attribute)

    # "Traditional" metadata as (element tag, printed label) pairs, in
    # output order.  Some entries are specific to Word or PowerPoint.
    metadata_fields = (
        ("dcterms:created", "Created: "),
        ("dcterms:modified", "Last-Modified: "),
        ("dc:creator", "Creator: "),
        ("dc:title", "Title: "),
        ("dc:subject", "Subject: "),
        ("dc:description", "Description: "),
        ("cp:keywords", "Keywords: "),
        ("cp:revision", "Revision: "),
        ("cp:lastModifiedBy", "LastSavedBy: "),
        ("Application", "Generator: "),
        ("Company", "Company: "),
        ("Template", "Template: "),
        ("Pages", "Number-of-Pages: "),
        ("Lines", "Number-of-Lines: "),
        ("Paragraphs", "Number-of-Paragraphs: "),
        ("Words", "Number-of-Words: "),
        ("Characters", "Number-of-Characters: "),
        ("Slides", "Number-of-Slides: "),
        ("HiddenSlides", "Number-of-Hidden-Slides: "),
        ("Notes", "Number-of-Notes: "),
        ("MMClips", "Number-of-'Multi-Media'-Clips: "),
        ("PresentationFormat", "Presentation-Format: "),
    )
    for tag, label in metadata_fields:
        collectMetadata(xml_dom.getElementsByTagName(tag), label)
189
190
def drillDown(label, tag_array, tag_name):
    """Report one attribute from every element in tag_array.

    Used when a tag name can occur more than once within a file.  For the
    known labels each distinct value is printed once, numbered by its
    position in the matching module-level list, so the printed label is
    unique.  Any other label is printed as "label: value" for every
    non-empty value, without numbering or duplicate suppression.

    Parameters:
       label     - to be printed with value of attribute
       tag_array - sequence of minidom elements
       tag_name  - name of the attribute whose value is to be obtained
    """
    # The label does not change while looping, so pick the list of
    # already-reported values once up front.
    if label.startswith("Archive-File"):
        seen = targetArray
    elif label.endswith("Content-Control"):
        seen = sdtTagArray
    elif label.endswith("Content-Control-Id"):
        seen = sdtIdArray
    elif label.endswith("Content-Control-Alias"):
        seen = sdtAliasArray
    elif label.endswith("GUID"):
        seen = GUID_Array
    elif label.endswith("Data-Store-Item-Id"):
        seen = dataStoreArray
    elif label.endswith("Image"):
        seen = imageArray
    else:
        seen = None

    for node in tag_array:
        val = node.getAttribute(tag_name)
        if not val:
            # Attribute missing or empty: nothing to report.
            continue

        if seen is None:
            print("%s: %s" % (label, val))
        elif val not in seen:
            seen.append(val)
            drillDownOutput(label, len(seen), val)
243
244
def drillDownOutput(label, count, value):
    """Print one numbered entry in "label<count>: value" format (DGI).

    Parameters:
       label    - to be printed in front of the count
       count    - number of times label has appeared with unique values;
                  it is appended to the label using its repr()
       value    - value to be printed with label
    """
    # One pre-formatted string keeps the output identical whether print is
    # a statement or a function.
    print("%s%r: %s" % (label, count, value))
253
254
def collectMetadata(tagArray, label):
    """Print label followed by the text of the first element in tagArray.

    Nothing is printed when tagArray is empty, when its first element has
    no children, or when that element's first child carries no text.
    Surrounding whitespace is stripped and embedded line breaks are
    replaced by spaces so the value stays on one output line.

    Parameters:
       tagArray - sequence of minidom elements
       label    - to be printed with text of object
    """
    if not tagArray or not tagArray[0].hasChildNodes():
        return

    # Only text nodes expose wholeText; skip a first child of any other
    # kind instead of failing with AttributeError.
    whole_text = getattr(tagArray[0].firstChild, "wholeText", None)
    if whole_text is None:
        return

    text = whole_text.strip().replace("\r", " ").replace("\n", " ")
    print("%s %s" % (label, text))
265
def process(fn):
    """Print the metadata of every XML part inside the package fn.

    Files that are not zip archives are ignored silently.  Every archive
    member whose name ends in ".xml" or ".rels" is read and handed to
    process_xml.

    Parameters:
       fn - path of the .docx/.xlsx/.pptx file to examine
    """
    import zipfile

    # Reset the module-level counter on every call.
    # NOTE(review): nothing visible in this file reads targetCounter;
    # kept in case other code depends on it.
    global targetCounter
    targetCounter = 0

    if not zipfile.is_zipfile(fn):
        return

    z = zipfile.ZipFile(fn, mode="r")
    try:
        for member in z.namelist():
            if member.endswith((".xml", ".rels")):
                # read() returns the member's bytes without leaving an
                # open per-member file handle behind.
                process_xml(z.read(member))
    finally:
        # Always release the archive, even if a part fails to parse.
        z.close()
279
280
281
282
# Define and initialize the lists of values already reported for each tag.
# They are shared by process_xml and drillDown so that a value is printed
# only once, and their lengths supply the number appended to each label.
targetArray = []        # 'Target' values of Relationship elements
idDefaultArray = []     # 'w:rsidRDefault' values of first paragraphs
revisionIdArray = []    # 'w:rsidR' values of first paragraphs
sdtTagArray = []        # 'w:val' values of w:tag elements
sdtIdArray = []         # 'w:val' values of w:id elements
sdtAliasArray = []      # 'w:val' values of w:alias elements
imageArray = []         # 'name' values of pic:cNvPr elements
GUID_Array = []         # 'w:val' values of w:guid elements
dataStoreArray = []     # 'ds:itemID' values of ds:datastoreItem elements
propertyTextArray = []  # 'w:t' values read from first w:r elements
295
#start the program here
# Everything is guarded by the __main__ check so that importing this module
# never prints the usage text or exits the importing process.
if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("Usage: docx_extractor filename.***x")
        sys.exit()
    process(sys.argv[1])
303
304