1package Text::Extract::Word; 2 3use strict; 4use warnings; 5 6our $VERSION = 0.02; 7 8use base qw(Exporter); 9 10our @EXPORT_OK = qw(get_all_text); 11 12#use Smart::Comments; 13 14use Carp; 15use Encode; 16use POSIX; 17use OLE::Storage_Lite; 18use IO::File; 19use Scalar::Util; 20 21sub new { 22 my ($this, @options) = @_; 23 my $class = ref($this) || $this; 24 25 my $self = { }; 26 bless $self, $class; 27 _initialize($self, @options); 28 return $self; 29} 30 31sub _initialize { 32 my ($self, @options) = @_; 33 my $value = shift(@options); 34 if (@options) { 35 carp("Ignored additional parameters to constructor"); 36 } 37 if (Scalar::Util::openhandle($value)) { 38 $self->{_fh} = $value; 39 } elsif (-e $value) { 40 my $oIo = IO::File->new(); 41 $oIo->open($value, "<") or croak("Can't open $value: $!"); 42 binmode($oIo); 43 $self->{_fh} = $oIo; 44 } else { 45 croak("Invalid parameter to constructor: $value should be a file handle or file name"); 46 } 47 _extract_stream($self); 48} 49 50sub _compare_ranges { 51 my ($range1, $range2) = @_; 52 return ($range1->[0] <=> $range2->[0]); 53} 54 55sub _extract_stream { 56 my ($self) = @_; 57 58 my $fh = $self->{_fh}; 59 my $ofs = OLE::Storage_Lite->new($fh); 60 my $name = encode("UCS-2LE", "WordDocument"); 61 my @pps = $ofs->getPpsSearch([$name], 1, 1); 62 croak("This does not seem to be a Word document") unless (@pps); 63 64 # OK, at this stage, we have the word stream. Now we need to start reading from it. 65 my $data = $pps[0]->{Data}; 66 $self->{_data} = $data; 67 68 my $magic = unpack("v", substr($data, 0x0000, 2)); 69 croak(sprintf("This does not seem to be a Word document, but it is pretending to be one: %x", $magic)) unless ($magic == 0xa5ec); 70 71 my $flags = unpack("v", substr($data, 0x000A, 2)); 72 my $table = ($flags & 0x0200) ? "1Table" : "0Table"; 73 $table = encode("UCS-2LE", $table); 74 75 @pps = $ofs->getPpsSearch([$table], 1, 1); 76 confess("Internal error: could not locate table stream") unless (@pps); 77 78 $table = $pps[0]->{Data}; 79 $self->{_table} = $table; 80 81 my $fcMin = unpack("V", substr($data, 0x0018, 4)); 82 my $ccpText = unpack("V", substr($data, 0x004c, 4)); 83 my $ccpFtn = unpack("V", substr($data, 0x0050, 4)); 84 my $ccpHdd = unpack("V", substr($data, 0x0054, 4)); 85 my $ccpAtn = unpack("V", substr($data, 0x005c, 4)); 86 87 $self->{_fcMin} = $fcMin; 88 $self->{_ccpText} = $ccpText; 89 $self->{_ccpFtn} = $ccpFtn; 90 $self->{_ccpHdd} = $ccpHdd; 91 $self->{_ccpAtn} = $ccpAtn; 92 93 my $charPLC = unpack("V", substr($data, 0x00fa, 4)); 94 my $charPlcSize = unpack("V", substr($data, 0x00fe, 4)); 95 my $parPLC = unpack("V", substr($data, 0x0102, 4)); 96 my $parPlcSize = unpack("V", substr($data, 0x0106, 4)); 97 98 # get the location of the piece table 99 my $complexOffset = unpack("V", substr($data, 0x01a2, 4)); 100 101### fcMin: $fcMin 102### ccpText: $ccpText 103### ccpFtn: $ccpFtn 104### ccpHdd: $ccpHdd 105### ccpAtn: $ccpAtn 106### end: $ccpText + $ccpFtn + $ccpHdd + $ccpAtn 107 108 # Read character positioning data positions 109 my $fcPlcfBteChpx = unpack("V", substr($data, 0x0fa, 4)); 110 my $lcbPlcfBteChpx = unpack("V", substr($data, 0x0fe, 4)); 111 $self->{_fcPlcfBteChpx} = $fcPlcfBteChpx; 112 $self->{_lcbPlcfBteChpx} = $lcbPlcfBteChpx; 113 114 _get_bookmarks($self); 115 116 my @pieces = _find_text(\$table, $complexOffset); 117 @pieces = sort { $a->{start} <=> $b->{start} } @pieces; 118 119 _get_text(\$data, \@pieces); 120 121 $self->{_pieces} = \@pieces; 122} 123 124sub _get_bookmarks { 125 my ($self) = @_; 126 127 # Now to look for bookmark information 128 my $fcSttbfBkmk = unpack("V", substr($self->{_data}, 0x0142, 4)); 129 my $lcbSttbfBkmk = unpack("V", substr($self->{_data}, 0x0146, 4)); 130 my $fcPlcfBkf = unpack("V", substr($self->{_data}, 0x014a, 4)); 131 my $lcbPlcfBkf = unpack("V", substr($self->{_data}, 0x014e, 4)); 132 my $fcPlcfBkl = unpack("V", substr($self->{_data}, 0x0152, 4)); 133 my $lcbPlcfBkl = unpack("V", substr($self->{_data}, 0x0156, 4)); 134### fcSttbfBkmk: $fcSttbfBkmk 135### lcbSttbfBkmk: $lcbSttbfBkmk 136### fcPlcfBkf: $fcPlcfBkf 137### lcbPlcfBkf: $lcbPlcfBkf 138### fcPlcfBkl: $fcPlcfBkl 139### lcbPlcfBkl: $lcbPlcfBkl 140 141 return if ($lcbSttbfBkmk == 0); 142 143 # Read the bookmark name block 144 my $sttbfBkmk = substr($self->{_table}, $fcSttbfBkmk, $lcbSttbfBkmk); 145 my $plcfBkf = substr($self->{_table}, $fcPlcfBkf, $lcbPlcfBkf); 146 my $plcfBkl = substr($self->{_table}, $fcPlcfBkl, $lcbPlcfBkl); 147 148 # Now we can read the bookmark names 149 150 my $fcExtend = unpack("v", substr($sttbfBkmk, 0, 2)); 151 my $cData = unpack("v", substr($sttbfBkmk, 2, 2)); 152 my $cbExtra = unpack("v", substr($sttbfBkmk, 4, 2)); 153 confess("Internal error: unexpected single-byte bookmark data") unless ($fcExtend == 0xffff); 154 155 my $offset = 6; 156 my $index = 0; 157 my %bookmarks = (); 158 while($offset < $lcbSttbfBkmk) { 159 my $length = unpack("v", substr($sttbfBkmk, $offset, 2)); 160 $length = $length * 2; 161 my $string = substr($sttbfBkmk, $offset + 2, $length); 162 my $cpStart = unpack("V", substr($plcfBkf, $index * 4, 4)); 163 my $cpEnd = unpack("V", substr($plcfBkl, $index * 4, 4)); 164 $string = Encode::decode("UCS-2LE", $string); 165### field name: $string 166### position: $cpStart 167### position: $cpEnd 168 $bookmarks{$string} = {start => $cpStart, end => $cpEnd}; 169 $offset += $length + 2; 170 $index++; 171 } 172 173 $self->{_bookmarks} = \%bookmarks; 174} 175 176sub _get_piece { 177 my ($dataref, $piece) = @_; 178 179 my $pstart = $piece->{start}; 180 my $ptotLength = $piece->{totLength}; 181 my $pfilePos = $piece->{filePos}; 182 my $punicode = $piece->{unicode}; 183 184 my $pend = $pstart + $ptotLength; 185 my $textStart = $pfilePos; 186 my $textEnd = $textStart + ($pend - $pstart); 187 188 if ($punicode) { 189 ### Adding ucs2 text... 190 ### Start: $textStart 191 ### End: $textEnd 192 ### Length: $textEnd - $textStart 193 ### Bytes: $ptotLength 194 $piece->{text} = _add_unicode_text($textStart, $textEnd, $dataref); 195 return; 196 } else { 197 ### Adding iso8869 text... 198 ### Start: $textStart 199 ### End: $textEnd 200 ### Length: $textEnd - $textStart 201 ### Bytes: $ptotLength 202 $piece->{text} = _add_text($textStart, $textEnd, $dataref); 203 return; 204 } 205} 206 207sub _get_text { 208 my ($dataref, $piecesref) = @_; 209 210 my @pieces = @$piecesref; 211 my @result = (); 212 my $index = 1; 213 my $position = 0; 214 215 foreach my $piece (@pieces) { 216 217 ### piece: $index++ 218 ### position: $position 219 $piece->{position} = $position; 220 221 _get_piece($dataref, $piece); 222 my $segment = $piece->{text}; 223 push @result, $segment; 224 my $length = length($segment); 225 $piece->{length} = $length; 226 $piece->{endPosition} = $position + $length; 227 228 $position += $length; 229 } 230 231 ### End position: $position 232 return; 233} 234 235sub _add_unicode_text { 236 my ($textStart, $textEnd, $dataref) = @_; 237 238 my $string = substr($$dataref, $textStart, 2*($textEnd - $textStart)); 239 240 my $perl_string = Encode::decode("UCS-2LE", $string); 241 return $perl_string; 242} 243 244sub _add_text { 245 my ($textStart, $textEnd, $dataref) = @_; 246 247 my $string = substr($$dataref, $textStart, $textEnd - $textStart); 248 249 my $perl_string = Encode::decode("iso-8859-1", $string); 250 251 # See the conversion table for FcCompressed structures. Note that these 252 # should not affect positions, as these are characters now, not bytes 253 $perl_string =~ tr[\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9f][\x{201A}\x{0192}\x{201E}\x{2026}\x{2020}\x{2021}\x{02C6}\x{2030}\x{0160}\x{2039}\x{0152}\x{2018}\x{2019}\x{201C}\x{201D}\x{2022}\x{2013}\x{2014}\x{02DC}\x{2122}\x{0161}\x{203A}\x{0153}\x{0178}]; 254 255 return $perl_string; 256} 257 258sub _get_chunks { 259 my ($start, $length, $piecesref) = @_; 260 my @result = (); 261 my $end = $start + $length; 262 263 foreach my $piece (@$piecesref) { 264 my ($pstart, $ptotLength, $pfilePos, $punicode) = @$piece; 265 my $pend = $pstart + $ptotLength; 266 if ($pstart < $end) { 267 if ($start < $pend) { 268 push @result, $piece; 269 } 270 } else { 271 last; 272 } 273 } 274 275 return @result; 276} 277 278sub _find_text { 279 my ($tableref, $pos) = @_; 280 281 my @pieces = (); 282 283 while(unpack("C", substr($$tableref, $pos, 1)) == 1) { 284 $pos++; 285 my $skip = unpack("v", substr($$tableref, $pos, 2)); 286# print STDERR sprintf("Skipping %d\n", $skip); 287 $pos += 2 + $skip; 288 } 289 290 if (unpack("C", substr($$tableref, $pos, 1)) != 2) { 291 confess("Internal error: ccorrupted Word file"); 292 } else { 293 my $pieceTableSize = unpack("V", substr($$tableref, ++$pos, 4)); 294# print STDERR sprintf("pieceTableSize: %d\n", $pieceTableSize); 295 296 $pos += 4; 297 my $pieces = ($pieceTableSize - 4) / 12; 298# print STDERR sprintf("pieces: %d\n", $pieces); 299 my $start = 0; 300 301 for (my $x = 0; $x < $pieces; $x++) { 302 my $filePos = unpack("V", substr($$tableref, $pos + (($pieces + 1) * 4) + ($x * 8) + 2, 4)); 303 my $unicode = 0; 304 if (($filePos & 0x40000000) == 0) { 305 $unicode = 1; 306 } else { 307 $unicode = 0; 308 $filePos &= ~(0x40000000); #gives me FC in doc stream 309 $filePos /= 2; 310 } 311# print STDERR sprintf("filePos: %x\n", $filePos); 312 my $lStart = unpack("V", substr($$tableref, $pos + ($x * 4), 4)); 313 my $lEnd = unpack("V", substr($$tableref, $pos + (($x + 1) * 4), 4)); 314 my $totLength = $lEnd - $lStart; 315 316# print STDERR "lStart: $lStart; lEnd: $lEnd\n"; 317 318# print STDERR ("Piece: " . (1 + $x) . ", start=" . $start 319# . ", len=" . $totLength . ", phys=" . $filePos 320# . ", uni=" .$unicode . "\n"); 321 322 # TextPiece piece = new TextPiece(start, totLength, filePos, unicode); 323 # start = start + totLength; 324 # text.add(piece); 325 326 push @pieces, {start => $start, 327 totLength => $totLength, 328 filePos => $filePos, 329 unicode => $unicode}; 330 $start = $start + (($unicode) ? $totLength/2 : $totLength); 331 } 332 } 333 return @pieces; 334} 335 336sub _get_piece_index { 337 my ($self, $position) = @_; 338 confess("Internal error: invalid position") if (! defined($position)); 339 my $index = 0; 340 foreach my $piece (@{$self->{_pieces}}) { 341 return $index if ($position <= $piece->{endPosition}); 342 $index++; 343 } 344} 345 346sub _get_text_range { 347 my ($self, $start, $end) = @_; 348 349 my $pieces = $self->{_pieces}; 350 my $start_piece = _get_piece_index($self, $start); 351 my $end_piece = _get_piece_index($self, $end); 352 my @result = (); 353 for(my $i = $start_piece; $i <= $end_piece; $i++) { 354 my $piece = $pieces->[$i]; 355 my $xstart = ($i == $start_piece) ? $start - $piece->{position} : 0; 356 my $xend = ($i == $end_piece) ? $end - $piece->{position} : $piece->{endPosition}; 357 push @result, substr($piece->{text}, $xstart, $xend - $xstart); 358 } 359 360 return join("", @result); 361} 362 363sub get_bookmarks { 364 my ($self, $filter) = @_; 365 my $bookmarks = $self->{_bookmarks}; 366 my @bookmark_names = sort keys %$bookmarks; 367 foreach my $name (@bookmark_names) { 368 my $bookmark = $bookmarks->{$name}; 369 next if (exists($bookmark->{value})); 370 my $start = $bookmark->{start}; 371 my $end = $bookmark->{end}; 372 my $value = _get_text_range($self, $start - 1, $end); 373 if (substr($value, 0, 1) ne chr(19)) { 374 $value = substr($value, 1); 375 } 376 $bookmark->{value} = $value; 377 ### name: $name 378 ### value: $value 379 } 380 381 return { map { ($_ => _filter($bookmarks->{$_}->{value}, $filter) ) } @bookmark_names }; 382} 383 384sub get_body { 385 my ($self, $filter) = @_; 386 my $start = 0; 387 return _filter(_get_text_range($self, $start, $start + $self->{_ccpText}), $filter); 388} 389 390sub get_footnotes { 391 my ($self, $filter) = @_; 392 my $start = $self->{_ccpText}; 393 return _filter(_get_text_range($self, $start, $start + $self->{_ccpFtn}), $filter); 394} 395 396sub get_headers { 397 my ($self, $filter) = @_; 398 my $start = $self->{_ccpText} + $self->{_ccpFtn}; 399 return _filter(_get_text_range($self, $start, $start + $self->{_ccpHdd}), $filter); 400} 401 402sub get_annotations { 403 my ($self, $filter) = @_; 404 my $start = $self->{_ccpText} + $self->{_ccpFtn} + $self->{_ccpHdd}; 405 return _filter(_get_text_range($self, $start, $start + $self->{_ccpAtn}), $filter); 406} 407 408sub get_text { 409 my ($self, $filter) = @_; 410 return $self->get_body($filter) . 411 $self->get_footnotes($filter) . 412 $self->get_headers($filter) . 413 $self->get_annotations($filter); 414} 415 416sub _filter { 417 my ($text, $filter) = @_; 418 if (! defined($filter)) { 419 $text =~ tr/\x02\x05\x08//d; 420 $text =~ tr/\x{2018}\x{2019}\x{201c}\x{201d}\x{0007}\x{000d}\x{2002}\x{2003}\x{2012}\x{2013}\x{2014}/''""\t\n \-\-\-/; 421 $text =~ s{\cS(?:[^\cT]*\cT)([^\cU]*)\cU}{$1}g; 422 $text =~ s{\cS(?:[^\cU]*\cU)}{}g; 423 $text =~ s{[\cJ\cM]}{\n}g; 424 } elsif ($filter eq ':raw') { 425 # Do nothing 426 } else { 427 croak("Invalid filter type: $filter"); 428 } 429 return $text; 430} 431 432sub get_all_text { 433 my ($file) = @_; 434 435 my $instance = __PACKAGE__->new($file); 436 437 $instance->get_bookmarks(); 438 return _get_text_range($instance, 0, $instance->{_ccpText} + 439 $instance->{_ccpFtn} + 440 $instance->{_ccpHdd} + 441 $instance->{_ccpAtn}); 442} 443 4441; 445 446=head1 NAME 447 448Text::Extract::Word - Extract text from Word files 449 450=head1 SYNOPSIS 451 452 # object-based interface 453 use Text::Extract::Word; 454 my $file = Text::Extract::Word->new("test1.doc"); 455 my $text = $file->get_text(); 456 my $body = $file->get_body(); 457 my $footnotes = $file->get_footnotes(); 458 my $headers = $file->get_headers(); 459 my $annotations = $file->get_annotations(); 460 my $bookmarks = $file->get_bookmarks(); 461 462 # specify :raw if you don't want the text cleaned 463 my $raw = $file->get_text(':raw'); 464 465 # legacy interface 466 use Text::Extract::Word qw(get_all_text); 467 my $text = get_all_text("test1.doc"); 468 469=head1 DESCRIPTION 470 471This simple module allows the textual contents to be extracted from a Word file. 472The code was ported from Java code, originally part of the Apache POE project, but 473extensive code changes were made internally. 474 475=head1 OBJECT-BASED INTERFACE 476 477=head2 Text::Extract::Word->new($input); 478 479Passed either a file name or an open file handle, this constructor returns an 480instance that can be used to query the file contents. 481 482=head1 METHODS 483 484All the query methods accept an optional filter argument that can take the value 485':raw' -- if this is passed the original Word file contents will be returned without 486any attempt to clean the text. 487 488The default filter attempts to remove Word internal characters used to identify 489fields (including field instructions), and translate common Unicode 'fancy' quotes 490into more conventional ISO-8859-1 equivalents, for ease of processing. Table cell 491markers are also translated into tabs, and paragraph marks into Perl newlines. 492 493=head2 get_body([$filter]) 494 495Returns the text for the main body of the Word document. This excludes headers, 496footers, and annotations. 497 498=head2 get_headers([$filter]) 499 500Returns the header and footer texts for the Word document, as a single scalar 501string. 502 503=head2 get_footnotes([$filter]) 504 505Returns the footnote and endnode texts for the Word document, as a single scalar 506string. 507 508=head2 get_annotations([$filter]) 509 510Returns the annotation texts for the Word document, as a single scalar 511string. 512 513=head2 get_text([$filter]) 514 515Returns the concatenated text from the body, headers, footnotes, and annotations 516of the the Word document, as a single scalar string. 517 518=head2 get_bookmarks([$filter]) 519 520Returns the bookmark texts for the Word document, as a hash reference. The keys 521in the hash are the bookmark names (Word requires that these are unique) and 522the values are the filtered bookmark texts. 523 524This method can be used to get Word form text data out of a Word file. All text fields 525in a Word form will normally be labelled as bookmarks, and will be returned by this 526method. Non-textual form fields (including drop-downs) will not be returned, as these 527are not labelled as bookmarks. 528 529=head1 FUNCTIONS 530 531=head2 get_all_text($filename) 532 533The only function exportable by this module, when called on a file name, returns the 534raw text contents of the Word file. The contents are returned as UTF-8 encoded text. 535This is unfiltered, for compatibility with previous versions of the module. 536 537=head1 TODO 538 539=over 4 540 541=item * 542 543handle non-textual form fields 544 545=back 546 547=head1 BUGS 548 549=over 4 550 551=item * 552 553support for legacy Word - the module does not extract text from Word version 6 or earlier 554 555=back 556 557=head1 SEE ALSO 558 559L<OLE::Storage> also has a script C<lhalw> (Let's Have a Look at Word) which extracts 560text from Word files. This is simply a much smaller module with lighter dependencies, 561using L<OLE::Storage_Lite> for its storage management. 562 563=head1 AUTHOR 564 565Stuart Watt, stuart@morungos.com 566 567=head1 COPYRIGHT 568 569Copyright (c) 2010 Stuart Watt. All rights reserved. 570 571=cut 572 573