1package Text::Extract::Word;
2
3use strict;
4use warnings;
5
6our $VERSION = 0.02;
7
8use base qw(Exporter);
9
10our @EXPORT_OK = qw(get_all_text);
11
12#use Smart::Comments;
13
14use Carp;
15use Encode;
16use POSIX;
17use OLE::Storage_Lite;
18use IO::File;
19use Scalar::Util;
20
21sub new {
22    my ($this, @options) = @_;
23    my $class = ref($this) || $this;
24
25    my $self = { };
26    bless $self, $class;
27    _initialize($self, @options);
28    return $self;
29}
30
31sub _initialize {
32    my ($self, @options) = @_;
33    my $value = shift(@options);
34    if (@options) {
35        carp("Ignored additional parameters to constructor");
36    }
37    if (Scalar::Util::openhandle($value)) {
38        $self->{_fh} = $value;
39    } elsif (-e $value) {
40        my $oIo = IO::File->new();
41        $oIo->open($value, "<") or croak("Can't open $value: $!");
42        binmode($oIo);
43        $self->{_fh} = $oIo;
44    } else {
45        croak("Invalid parameter to constructor: $value should be a file handle or file name");
46    }
47    _extract_stream($self);
48}
49
50sub _compare_ranges {
51    my ($range1, $range2) = @_;
52    return ($range1->[0] <=> $range2->[0]);
53}
54
55sub _extract_stream {
56    my ($self) = @_;
57
58    my $fh = $self->{_fh};
59    my $ofs = OLE::Storage_Lite->new($fh);
60    my $name = encode("UCS-2LE", "WordDocument");
61    my @pps = $ofs->getPpsSearch([$name], 1, 1);
62    croak("This does not seem to be a Word document") unless (@pps);
63
64    # OK, at this stage, we have the word stream. Now we need to start reading from it.
65    my $data = $pps[0]->{Data};
66    $self->{_data} = $data;
67
68    my $magic = unpack("v", substr($data, 0x0000, 2));
69    croak(sprintf("This does not seem to be a Word document, but it is pretending to be one: %x", $magic)) unless ($magic == 0xa5ec);
70
71    my $flags = unpack("v", substr($data, 0x000A, 2));
72    my $table = ($flags & 0x0200) ? "1Table" : "0Table";
73    $table = encode("UCS-2LE", $table);
74
75    @pps = $ofs->getPpsSearch([$table], 1, 1);
76    confess("Internal error: could not locate table stream") unless (@pps);
77
78    $table = $pps[0]->{Data};
79    $self->{_table} = $table;
80
81    my $fcMin = unpack("V", substr($data, 0x0018, 4));
82    my $ccpText = unpack("V", substr($data, 0x004c, 4));
83    my $ccpFtn = unpack("V", substr($data, 0x0050, 4));
84    my $ccpHdd = unpack("V", substr($data, 0x0054, 4));
85    my $ccpAtn = unpack("V", substr($data, 0x005c, 4));
86
87    $self->{_fcMin} = $fcMin;
88    $self->{_ccpText} = $ccpText;
89    $self->{_ccpFtn} = $ccpFtn;
90    $self->{_ccpHdd} = $ccpHdd;
91    $self->{_ccpAtn} = $ccpAtn;
92
93    my $charPLC = unpack("V", substr($data, 0x00fa, 4));
94    my $charPlcSize = unpack("V", substr($data, 0x00fe, 4));
95    my $parPLC = unpack("V", substr($data, 0x0102, 4));
96    my $parPlcSize = unpack("V", substr($data, 0x0106, 4));
97
98    # get the location of the piece table
99    my $complexOffset = unpack("V", substr($data, 0x01a2, 4));
100
101### fcMin:   $fcMin
102### ccpText: $ccpText
103### ccpFtn:  $ccpFtn
104### ccpHdd:  $ccpHdd
105### ccpAtn:  $ccpAtn
106### end:     $ccpText + $ccpFtn + $ccpHdd + $ccpAtn
107
108    # Read character positioning data positions
109    my $fcPlcfBteChpx = unpack("V", substr($data, 0x0fa, 4));
110    my $lcbPlcfBteChpx = unpack("V", substr($data, 0x0fe, 4));
111    $self->{_fcPlcfBteChpx} = $fcPlcfBteChpx;
112    $self->{_lcbPlcfBteChpx} = $lcbPlcfBteChpx;
113
114    _get_bookmarks($self);
115
116    my @pieces = _find_text(\$table, $complexOffset);
117    @pieces = sort { $a->{start} <=> $b->{start} } @pieces;
118
119    _get_text(\$data, \@pieces);
120
121    $self->{_pieces} = \@pieces;
122}
123
124sub _get_bookmarks {
125    my ($self) = @_;
126
127    # Now to look for bookmark information
128    my $fcSttbfBkmk = unpack("V", substr($self->{_data}, 0x0142, 4));
129    my $lcbSttbfBkmk = unpack("V", substr($self->{_data}, 0x0146, 4));
130    my $fcPlcfBkf = unpack("V", substr($self->{_data}, 0x014a, 4));
131    my $lcbPlcfBkf = unpack("V", substr($self->{_data}, 0x014e, 4));
132    my $fcPlcfBkl = unpack("V", substr($self->{_data}, 0x0152, 4));
133    my $lcbPlcfBkl = unpack("V", substr($self->{_data}, 0x0156, 4));
134### fcSttbfBkmk:  $fcSttbfBkmk
135### lcbSttbfBkmk: $lcbSttbfBkmk
136### fcPlcfBkf:    $fcPlcfBkf
137### lcbPlcfBkf:   $lcbPlcfBkf
138### fcPlcfBkl:    $fcPlcfBkl
139### lcbPlcfBkl:   $lcbPlcfBkl
140
141    return if ($lcbSttbfBkmk == 0);
142
143    # Read the bookmark name block
144    my $sttbfBkmk = substr($self->{_table}, $fcSttbfBkmk, $lcbSttbfBkmk);
145    my $plcfBkf = substr($self->{_table}, $fcPlcfBkf, $lcbPlcfBkf);
146    my $plcfBkl = substr($self->{_table}, $fcPlcfBkl, $lcbPlcfBkl);
147
148    # Now we can read the bookmark names
149
150    my $fcExtend = unpack("v", substr($sttbfBkmk, 0, 2));
151    my $cData = unpack("v", substr($sttbfBkmk, 2, 2));
152    my $cbExtra = unpack("v", substr($sttbfBkmk, 4, 2));
153    confess("Internal error: unexpected single-byte bookmark data") unless ($fcExtend == 0xffff);
154
155    my $offset = 6;
156    my $index = 0;
157    my %bookmarks = ();
158    while($offset < $lcbSttbfBkmk) {
159        my $length = unpack("v", substr($sttbfBkmk, $offset, 2));
160        $length = $length * 2;
161        my $string = substr($sttbfBkmk, $offset + 2, $length);
162        my $cpStart = unpack("V", substr($plcfBkf, $index * 4, 4));
163        my $cpEnd = unpack("V", substr($plcfBkl, $index * 4, 4));
164        $string = Encode::decode("UCS-2LE", $string);
165### field name: $string
166### position:   $cpStart
167### position:   $cpEnd
168        $bookmarks{$string} = {start => $cpStart, end => $cpEnd};
169        $offset += $length + 2;
170        $index++;
171    }
172
173    $self->{_bookmarks} = \%bookmarks;
174}
175
176sub _get_piece {
177    my ($dataref, $piece) = @_;
178
179    my $pstart = $piece->{start};
180    my $ptotLength = $piece->{totLength};
181    my $pfilePos = $piece->{filePos};
182    my $punicode = $piece->{unicode};
183
184    my $pend = $pstart + $ptotLength;
185    my $textStart = $pfilePos;
186    my $textEnd = $textStart + ($pend - $pstart);
187
188    if ($punicode) {
189        ### Adding ucs2 text...
190        ### Start: $textStart
191        ### End: $textEnd
192        ### Length: $textEnd - $textStart
193        ### Bytes: $ptotLength
194        $piece->{text} = _add_unicode_text($textStart, $textEnd, $dataref);
195        return;
196    } else {
197        ### Adding iso8869 text...
198        ### Start: $textStart
199        ### End: $textEnd
200        ### Length: $textEnd - $textStart
201        ### Bytes: $ptotLength
202        $piece->{text} = _add_text($textStart, $textEnd, $dataref);
203        return;
204    }
205}
206
207sub _get_text {
208    my ($dataref, $piecesref) = @_;
209
210    my @pieces = @$piecesref;
211    my @result = ();
212    my $index = 1;
213    my $position = 0;
214
215    foreach my $piece (@pieces) {
216
217        ### piece: $index++
218        ### position: $position
219        $piece->{position} = $position;
220
221        _get_piece($dataref, $piece);
222        my $segment = $piece->{text};
223        push @result, $segment;
224        my $length = length($segment);
225        $piece->{length} = $length;
226        $piece->{endPosition} = $position + $length;
227
228        $position += $length;
229    }
230
231    ### End position: $position
232    return;
233}
234
235sub _add_unicode_text {
236    my ($textStart, $textEnd, $dataref) = @_;
237
238    my $string = substr($$dataref, $textStart, 2*($textEnd - $textStart));
239
240    my $perl_string = Encode::decode("UCS-2LE", $string);
241    return $perl_string;
242}
243
244sub _add_text {
245    my ($textStart, $textEnd, $dataref) = @_;
246
247    my $string = substr($$dataref, $textStart, $textEnd - $textStart);
248
249    my $perl_string = Encode::decode("iso-8859-1", $string);
250
251    # See the conversion table for FcCompressed structures. Note that these
252    # should not affect positions, as these are characters now, not bytes
253    $perl_string =~ tr[\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9f][\x{201A}\x{0192}\x{201E}\x{2026}\x{2020}\x{2021}\x{02C6}\x{2030}\x{0160}\x{2039}\x{0152}\x{2018}\x{2019}\x{201C}\x{201D}\x{2022}\x{2013}\x{2014}\x{02DC}\x{2122}\x{0161}\x{203A}\x{0153}\x{0178}];
254
255    return $perl_string;
256}
257
258sub _get_chunks {
259    my ($start, $length, $piecesref) = @_;
260    my @result = ();
261    my $end = $start + $length;
262
263    foreach my $piece (@$piecesref) {
264        my ($pstart, $ptotLength, $pfilePos, $punicode) = @$piece;
265        my $pend = $pstart + $ptotLength;
266        if ($pstart < $end) {
267            if ($start < $pend) {
268                push @result, $piece;
269            }
270        } else {
271            last;
272        }
273    }
274
275    return @result;
276}
277
278sub _find_text {
279    my ($tableref, $pos) = @_;
280
281    my @pieces = ();
282
283    while(unpack("C", substr($$tableref, $pos, 1)) == 1) {
284        $pos++;
285        my $skip = unpack("v", substr($$tableref, $pos, 2));
286#        print STDERR sprintf("Skipping %d\n", $skip);
287        $pos += 2 + $skip;
288    }
289
290    if (unpack("C", substr($$tableref, $pos, 1)) != 2) {
291         confess("Internal error: ccorrupted Word file");
292    } else {
293        my $pieceTableSize = unpack("V", substr($$tableref, ++$pos, 4));
294#        print STDERR sprintf("pieceTableSize: %d\n", $pieceTableSize);
295
296        $pos += 4;
297        my $pieces = ($pieceTableSize - 4) / 12;
298#        print STDERR sprintf("pieces: %d\n", $pieces);
299        my $start = 0;
300
301        for (my $x = 0; $x < $pieces; $x++) {
302            my $filePos = unpack("V", substr($$tableref, $pos + (($pieces + 1) * 4) + ($x * 8) + 2, 4));
303            my $unicode = 0;
304            if (($filePos & 0x40000000) == 0) {
305                $unicode = 1;
306            } else {
307                $unicode = 0;
308                $filePos &= ~(0x40000000); #gives me FC in doc stream
309                $filePos /= 2;
310            }
311#            print STDERR sprintf("filePos: %x\n", $filePos);
312            my $lStart = unpack("V", substr($$tableref, $pos + ($x * 4), 4));
313            my $lEnd = unpack("V", substr($$tableref, $pos + (($x + 1) * 4), 4));
314            my $totLength = $lEnd - $lStart;
315
316#            print STDERR "lStart: $lStart; lEnd: $lEnd\n";
317
318#            print STDERR ("Piece: " . (1 + $x) . ", start=" . $start
319#                            . ", len=" . $totLength . ", phys=" . $filePos
320#                            . ", uni=" .$unicode . "\n");
321
322            # TextPiece piece = new TextPiece(start, totLength, filePos, unicode);
323            # start = start + totLength;
324            # text.add(piece);
325
326            push @pieces, {start => $start,
327                           totLength => $totLength,
328                           filePos => $filePos,
329                           unicode => $unicode};
330            $start = $start + (($unicode) ? $totLength/2 : $totLength);
331        }
332    }
333    return @pieces;
334}
335
336sub _get_piece_index {
337    my ($self, $position) = @_;
338    confess("Internal error: invalid position") if (! defined($position));
339    my $index = 0;
340    foreach my $piece (@{$self->{_pieces}}) {
341        return $index if ($position <= $piece->{endPosition});
342        $index++;
343    }
344}
345
346sub _get_text_range {
347    my ($self, $start, $end) = @_;
348
349    my $pieces = $self->{_pieces};
350    my $start_piece = _get_piece_index($self, $start);
351    my $end_piece = _get_piece_index($self, $end);
352    my @result = ();
353    for(my $i = $start_piece; $i <= $end_piece; $i++) {
354        my $piece = $pieces->[$i];
355        my $xstart = ($i == $start_piece) ? $start - $piece->{position} : 0;
356        my $xend = ($i == $end_piece) ? $end - $piece->{position} : $piece->{endPosition};
357        push @result, substr($piece->{text}, $xstart, $xend - $xstart);
358    }
359
360    return join("", @result);
361}
362
363sub get_bookmarks {
364    my ($self, $filter) = @_;
365    my $bookmarks = $self->{_bookmarks};
366    my @bookmark_names = sort keys %$bookmarks;
367    foreach my $name (@bookmark_names) {
368        my $bookmark = $bookmarks->{$name};
369        next if (exists($bookmark->{value}));
370        my $start = $bookmark->{start};
371        my $end = $bookmark->{end};
372        my $value = _get_text_range($self, $start - 1, $end);
373        if (substr($value, 0, 1) ne chr(19)) {
374            $value = substr($value, 1);
375        }
376        $bookmark->{value} = $value;
377        ### name: $name
378        ### value: $value
379    }
380
381    return { map { ($_ => _filter($bookmarks->{$_}->{value}, $filter) ) } @bookmark_names };
382}
383
384sub get_body {
385    my ($self, $filter) = @_;
386    my $start = 0;
387    return _filter(_get_text_range($self, $start, $start + $self->{_ccpText}), $filter);
388}
389
390sub get_footnotes {
391    my ($self, $filter) = @_;
392    my $start = $self->{_ccpText};
393    return _filter(_get_text_range($self, $start, $start + $self->{_ccpFtn}), $filter);
394}
395
396sub get_headers {
397    my ($self, $filter) = @_;
398    my $start = $self->{_ccpText} + $self->{_ccpFtn};
399    return _filter(_get_text_range($self, $start, $start + $self->{_ccpHdd}), $filter);
400}
401
402sub get_annotations {
403    my ($self, $filter) = @_;
404    my $start = $self->{_ccpText} + $self->{_ccpFtn} + $self->{_ccpHdd};
405    return _filter(_get_text_range($self, $start, $start + $self->{_ccpAtn}), $filter);
406}
407
408sub get_text {
409    my ($self, $filter) = @_;
410    return $self->get_body($filter) .
411           $self->get_footnotes($filter) .
412           $self->get_headers($filter) .
413           $self->get_annotations($filter);
414}
415
416sub _filter {
417    my ($text, $filter) = @_;
418    if (! defined($filter)) {
419        $text =~ tr/\x02\x05\x08//d;
420        $text =~ tr/\x{2018}\x{2019}\x{201c}\x{201d}\x{0007}\x{000d}\x{2002}\x{2003}\x{2012}\x{2013}\x{2014}/''""\t\n  \-\-\-/;
421        $text =~ s{\cS(?:[^\cT]*\cT)([^\cU]*)\cU}{$1}g;
422        $text =~ s{\cS(?:[^\cU]*\cU)}{}g;
423        $text =~ s{[\cJ\cM]}{\n}g;
424    } elsif ($filter eq ':raw') {
425        # Do nothing
426    } else {
427        croak("Invalid filter type: $filter");
428    }
429    return $text;
430}
431
432sub get_all_text {
433    my ($file) = @_;
434
435    my $instance = __PACKAGE__->new($file);
436
437    $instance->get_bookmarks();
438    return _get_text_range($instance, 0, $instance->{_ccpText} +
439                                         $instance->{_ccpFtn} +
440                                         $instance->{_ccpHdd} +
441                                         $instance->{_ccpAtn});
442}
443
4441;
445
446=head1 NAME
447
448Text::Extract::Word - Extract text from Word files
449
450=head1 SYNOPSIS
451
452 # object-based interface
453 use Text::Extract::Word;
454 my $file = Text::Extract::Word->new("test1.doc");
455 my $text = $file->get_text();
456 my $body = $file->get_body();
457 my $footnotes = $file->get_footnotes();
458 my $headers = $file->get_headers();
459 my $annotations = $file->get_annotations();
460 my $bookmarks = $file->get_bookmarks();
461
462 # specify :raw if you don't want the text cleaned
463 my $raw = $file->get_text(':raw');
464
465 # legacy interface
466 use Text::Extract::Word qw(get_all_text);
467 my $text = get_all_text("test1.doc");
468
469=head1 DESCRIPTION
470
471This simple module allows the textual contents to be extracted from a Word file.
472The code was ported from Java code, originally part of the Apache POE project, but
473extensive code changes were made internally.
474
475=head1 OBJECT-BASED INTERFACE
476
477=head2 Text::Extract::Word->new($input);
478
479Passed either a file name or an open file handle, this constructor returns an
480instance that can be used to query the file contents.
481
482=head1 METHODS
483
484All the query methods accept an optional filter argument that can take the value
485':raw' -- if this is passed the original Word file contents will be returned without
486any attempt to clean the text.
487
488The default filter attempts to remove Word internal characters used to identify
489fields (including field instructions), and translate common Unicode 'fancy' quotes
490into more conventional ISO-8859-1 equivalents, for ease of processing. Table cell
491markers are also translated into tabs, and paragraph marks into Perl newlines.
492
493=head2 get_body([$filter])
494
495Returns the text for the main body of the Word document. This excludes headers,
496footers, and annotations.
497
498=head2 get_headers([$filter])
499
500Returns the header and footer texts for the Word document, as a single scalar
501string.
502
503=head2 get_footnotes([$filter])
504
505Returns the footnote and endnode texts for the Word document, as a single scalar
506string.
507
508=head2 get_annotations([$filter])
509
510Returns the annotation texts for the Word document, as a single scalar
511string.
512
513=head2 get_text([$filter])
514
515Returns the concatenated text from the body, headers, footnotes, and annotations
516of the the Word document, as a single scalar string.
517
518=head2 get_bookmarks([$filter])
519
520Returns the bookmark texts for the Word document, as a hash reference. The keys
521in the hash are the bookmark names (Word requires that these are unique) and
522the values are the filtered bookmark texts.
523
524This method can be used to get Word form text data out of a Word file. All text fields
525in a Word form will normally be labelled as bookmarks, and will be returned by this
526method. Non-textual form fields (including drop-downs) will not be returned, as these
527are not labelled as bookmarks.
528
529=head1 FUNCTIONS
530
531=head2 get_all_text($filename)
532
533The only function exportable by this module, when called on a file name, returns the
534raw text contents of the Word file. The contents are returned as UTF-8 encoded text.
535This is unfiltered, for compatibility with previous versions of the module.
536
537=head1 TODO
538
539=over 4
540
541=item *
542
543handle non-textual form fields
544
545=back
546
547=head1 BUGS
548
549=over 4
550
551=item *
552
553support for legacy Word - the module does not extract text from Word version 6 or earlier
554
555=back
556
557=head1 SEE ALSO
558
559L<OLE::Storage> also has a script C<lhalw> (Let's Have a Look at Word) which extracts
560text from Word files. This is simply a much smaller module with lighter dependencies,
561using L<OLE::Storage_Lite> for its storage management.
562
563=head1 AUTHOR
564
565Stuart Watt, stuart@morungos.com
566
567=head1 COPYRIGHT
568
569Copyright (c) 2010 Stuart Watt. All rights reserved.
570
571=cut
572
573