1#!/usr/bin/env perl
2
3# docx2txt, a command-line utility to convert Docx documents to text format.
4# Copyright (C) 2008-2014 Sandeep Kumar
5#
6# This program is free software; you can redistribute it and/or modify
7# it under the terms of the GNU General Public License as published by
8# the Free Software Foundation; either version 3 of the License, or
9# (at your option) any later version.
10#
11# This program is distributed in the hope that it will be useful,
12# but WITHOUT ANY WARRANTY; without even the implied warranty of
13# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14# GNU General Public License for more details.
15#
16# You should have received a copy of the GNU General Public License
17# along with this program; if not, write to the Free Software
18# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
20#
21# This script extracts text from document.xml contained inside .docx file.
22# Perl v5.10.1 was used for testing this script.
23#
24# Author : Sandeep Kumar (shimple0 -AT- Yahoo .DOT. COM)
25#
26# ChangeLog :
27#
28#    10/08/2008 - Initial version (v0.1)
29#    15/08/2008 - Script takes two arguments [second optional] now and can be
30#                 used independently to extract text from docx file. It accepts
31#                 docx file directly, instead of xml file.
32#    18/08/2008 - Added support for center and right justification of text that
33#                 fits in a line 80 characters wide (adjustable).
34#    03/09/2008 - Fixed the slip in usage message.
35#    12/09/2008 - Slightly changed the script invocation and argument handling
36#                 to incorporate some of the shell script functionality here.
37#                 Added support to handle embedded urls in docx document.
38#    23/09/2008 - Changed #! line to use /usr/bin/env - good suggestion from
39#                 Rene Maroufi (info>AT<maroufi>DOT<net) to reduce user work
40#                 during installation.
41#    31/08/2009 - Added support for handling more escape characters.
42#                 Using OS specific null device to redirect stderr.
43#                 Saving text file in binary mode.
#    03/09/2009 - Updates based on feedback/suggestions from Sergei Kulakov
45#                 (sergei>AT<dewia>DOT<com).
46#                 - removal of non-document text in between TOC related tags.
47#                 - display of hyperlink alongside linked text user controlled.
48#                 - some character conversion updates
49#    05/09/2009 - Merged cjustify and rjustify into single subroutine justify.
50#                 Added more character conversions.
51#                 Organised conversion mappings in tabular form for speedup and
52#                 easy maintenance.
53#                 Tweaked code to reduce number of passes over document content.
54#    10/09/2009 - For leaner text experience, hyperlink is not displayed if
55#                 hyperlink and hyperlinked text are same, even if user has
56#                 enabled hyperlink display.
57#                 Improved handling of short line justification. Many
58#                 justification tag patterns were not captured earlier.
59#    11/09/2009 - A directory holding the unzipped content of .docx file can
60#                 also be specified as argument to the script, in place of file.
61#    17/09/2009 - Removed trailing slashes from input directory name.
62#                 Updated unzip command invocations to handle path names
63#                 containing spaces.
64#    01/10/2009 - Added support for configuration file.
65#    02/10/2009 - Using single quotes to specify path for unzip command.
66#    04/10/2009 - Corrected configuration option name lineIndent to listIndent.
67#    11/12/2011 - Configuration variables now begin with config_ .
68#                 Configuration file is looked for in HOME directory as well.
69#                 Added a check for existence of unzip command.
70#                 Superscripted cross-references are placed within [...] now.
71#                 Fixed bugs #3003903, #3082018 and #3082035.
72#                 Fixed nullDevice for Cygwin.
73#    12/12/2011 - Configuration file is also looked for in /usr/local/etc, default
74#                 location for Unix-ish systems.
75#    22/12/2011 - Added &apos; and &quot; to docx specific escape characters
76#                 conversions. [Bug #3463033]
77#    24/12/2011 - Improved handling of special (non-text) characters, along with
78#                 support for more non-text characters.
79#    05/01/2012 - Configuration file is now looked for in current directory,
80#                 user configuration directory and system configuration
81#                 directory (in the specified order). This streamlining allows
82#                 for per user configuration file even on Windows.
83#    14/01/2012 - Wrong code was committed during earlier fixing of nullDevice
84#                 for Cygwin, fixed that.
85#                 Usage is extended to accept docx file from standard input.
86#                 "-h" has to be given as the first argument to get usage help.
87#                 Added new configuration variable "config_tempDir".
#    14/03/2014 - Remove deleted text from output. This takes effect when changes
#                 are being tracked in docx document. Patch was contributed by
90#                 William Parsons (wbparsons>AT<cshore>DOT<com).
91#                 Removed experimental config option config_exp_extra_deEscape.
92#    27/03/2014 - Remove non-document_text content marked by wp/wp14 tags.
93#    07/04/2014 - Added support for handling lists (bullet, decimal, letter,
94#                 roman) along with (attempt at) indentation.
95#                 Added new configuration variable config_twipsPerChar.
96#                 Removed configuration variable config_listIndent.
97#    14/04/2014 - Fixed list numbering - lvl start value needs to be considered.
98#                 Improved list indentation and corresponding code.
99#    27/04/2014 - Improved paragraph content layout/indentation.
100#    13/05/2014 - Added new configuration variable config_unzip_opts. Users can
101#                 now use unzipping programs like 7z, pkzipc, winzip as well.
102#
103
104
105#
106# The default settings below can be overridden via docx2txt.config in current
107# directory/ user configuration directory/ system configuration directory.
108#
109
# Program used to pull individual files out of the .docx archive, and the
# option string placed right after it when the extraction command is built.
our $config_unzip = '/usr/bin/unzip';	# Windows path like 'C:/path/to/unzip.exe'
our $config_unzip_opts = '-p';		# To extract file on standard output

# Output formatting knobs.
our $config_newLine = "\n";		# Alternative is "\r\n".
our $config_lineWidth = 80;		# Line width, used for short line justification.
our $config_showHyperLink = "N";	# Show hyperlink alongside linked text ("Y"/"y" enables it).
our $config_tempDir;			# Directory for temporary file creation; set in the platform section below.
our $config_twipsPerChar = 120;		# Approx mapping for layout purpose: twips of indentation per output character.
118
119
120#
121# Windows/Non-Windows specific settings. Adjust these here, if needed.
122#
123
# Choose the null device, the configuration directories and the default
# temporary directory.  The Windows branch is taken only when OS starts with
# "Windows" and neither OSTYPE nor HOME is present in the environment.
if (defined $ENV{OS} && $ENV{OS} =~ /^Windows/ && !(exists $ENV{OSTYPE} || exists $ENV{HOME})) {
    $nullDevice = "nul";
    $userConfigDir = $ENV{APPDATA};

    #
    # On Windows, configuration file is installed in same folder as this script.
    # When the script is invoked without any directory component the pattern
    # does not match; use the current directory then instead of relying on a
    # capture variable that was never set.
    #
    if ($0 =~ m%^(.*[/\\])[^/\\]*?$%) {
        $systemConfigDir = $1;
    } else {
        $systemConfigDir = ".";
    }

    $config_tempDir = "$ENV{TEMP}";
} else {
    $nullDevice = "/dev/null";
    $userConfigDir = $ENV{HOME};
    $systemConfigDir = "/usr/local/etc";

    $config_tempDir = "/tmp";
}
142
143
144#
145# Character conversion tables
146#
147
148# Only (amp, apos, gt, lt and quot) are the required reserved characters in HTML
149# and XHTML, others are used for better text experience.
# Maps an entity name (the text between '&' and ';') to its plain-text
# replacement.  The entity substitution near the end of this script only
# matches amp, apos, gt, lt and quot; the remaining entries are not looked up
# by that substitution.
my %escChrs = (	amp => '&', apos => '\'', gt => '>', lt => '<', quot => '"',
		acute => '\'', brvbar => '|', copy => '(C)', divide => '/',
		laquo => '<<', macr => '-', nbsp => ' ', raquo => '>>',
		reg => '(R)', shy => '-', times => 'x'
);
155
# Two-level lookup table for UTF-8 encoded characters, addressed by raw bytes:
# the outer key is the leading byte (or leading two bytes of a three-byte
# sequence), the inner key is the final byte, and the value is the ASCII text
# written in its place.  Sequences without an entry are left untouched by the
# conversion pass at the end of the script.
my %splchars = (
    "\xC2" => {
	"\xA0" => ' ',		# <nbsp> non-breaking space
	"\xA2" => 'cent',	# <cent>
	"\xA3" => 'Pound',	# <pound>
	"\xA5" => 'Yen',	# <yen>
	"\xA6" => '|',		# <brvbar> broken vertical bar
#	"\xA7" => '',		# <sect> section
	"\xA9" => '(C)',	# <copy> copyright
	"\xAB" => '<<',		# <laquo> angle quotation mark (left)
	"\xAC" => '-',		# <not> negation
	"\xAE" => '(R)',	# <reg> registered trademark
	"\xB1" => '+-',		# <plusmn> plus-or-minus
	"\xB4" => '\'',		# <acute>
	"\xB5" => 'u',		# <micro>
#	"\xB6" => '',		# <para> paragraph
	"\xBB" => '>>',		# <raquo> angle quotation mark (right)
	"\xBC" => '(1/4)',	# <frac14> fraction 1/4
	"\xBD" => '(1/2)',	# <frac12> fraction 1/2
	"\xBE" => '(3/4)',	# <frac34> fraction 3/4
    },

    "\xC3" => {
	"\x97" => 'x',		# <times> multiplication
	"\xB7" => '/',		# <divide> division
    },

    "\xCF" => {
	"\x80" => 'PI',		# <pi>
    },

    "\xE2\x80" => {
	"\x82" => '  ',		# <ensp> en space
	"\x83" => '  ',		# <emsp> em space
	"\x85" => ' ',		# <qemsp> four-per-em space (U+2005)
	"\x93" => ' - ',	# <ndash> en dash
	"\x94" => ' -- ',	# <mdash> em dash
	"\x95" => '--',		# <horizontal bar>
	"\x98" => '`',		# <soq> single opening quote
	"\x99" => '\'',		# <scq> single closing quote
	"\x9C" => '"',		# <doq> double opening quote
	"\x9D" => '"',		# <dcq> double closing quote
	"\xA2" => '::',		# U+2022 bullet (originally labelled <diamond symbol>)
	"\xA6" => '...',	# <hellip> horizontal ellipsis
	"\xB0" => '%.',		# <permil> per mille
    },

    "\xE2\x82" => {
	"\xAC" => 'Euro'	# <euro>
    },

    "\xE2\x84" => {
	"\x85" => 'c/o',	# <care/of>
	"\x97" => '(P)',	# <sound recording copyright>
	"\xA0" => '(SM)',	# <servicemark>
	"\xA2" => '(TM)',	# <trade> trademark
	"\xA6" => 'Ohm',	# <Ohm>
    },

    # Vulgar fractions U+2153 .. U+215F.
    "\xE2\x85" => {
	"\x93" => '(1/3)',
	"\x94" => '(2/3)',
	"\x95" => '(1/5)',
	"\x96" => '(2/5)',
	"\x97" => '(3/5)',
	"\x98" => '(4/5)',
	"\x99" => '(1/6)',
	"\x9B" => '(1/8)',
	"\x9C" => '(3/8)',
	"\x9D" => '(5/8)',
	"\x9E" => '(7/8)',
	"\x9F" => '1/',		# U+215F fraction numerator one
    },

    "\xE2\x86" => {
	"\x90" => '<--',	# <larr> left arrow
	"\x92" => '-->',	# <rarr> right arrow
	"\x94" => '<-->',	# <harr> left right arrow
    },

    "\xE2\x88" => {
	"\x82" => 'd',		# partial differential
	"\x9E" => 'infinity',
    },

    "\xE2\x89" => {
	"\xA0" => '!=',		# <neq>
	"\xA4" => '<=',		# <leq>
	"\xA5" => '>=',		# <geq>
    },

    "\xEF\x82" => {
	# U+F0B7, a private-use code point; the %bullets table further down
	# labels the same byte sequence "Small Black Circle".
	"\xB7" => '*'		# small white square (original label)
    }
);
251
252
253#
254# Check argument(s) sanity.
255#
256
# Help text; $0 is interpolated so the message shows the name the script was
# started with.
my $usage = <<"USAGE";

Usage:	$0 [infile.docx|-|-h] [outfile.txt|-]
	$0 < infile.docx
	$0 < infile.docx > outfile.txt

	In second usage, output is dumped on STDOUT.

	Use '-h' as the first argument to get this usage information.

	Use '-' as the infile name to read the docx file from STDIN.

	Use '-' as the outfile name to dump the text on STDOUT.
	Output is saved in infile.txt if second argument is omitted.

Note:	infile.docx can also be a directory name holding the unzipped content
	of concerned .docx file.

USAGE

# More than two arguments, or an explicit request for help, ends the run with
# the usage text.
if (@ARGV > 2 || $ARGV[0] eq '-h') {
    die $usage;
}
278
279
280#
281# Look for configuration file in current directory/ user configuration
282# directory/ system configuration directory - in the given order.
283#
284
my %config;

# Candidate configuration files, in lookup order.  The current directory
# entry is written as "./docx2txt.config": do FILE resolves a bare relative
# name through @INC, and '.' is no longer part of @INC from Perl 5.26 on, so
# the unprefixed name would pass the -f test yet never be loaded.
# Directories that are not known (e.g. HOME unset) are skipped instead of
# degenerating into a lookup in the filesystem root.
my @configCandidates = ("./docx2txt.config");
foreach my $configDir ($userConfigDir, $systemConfigDir) {
    next if ! (defined $configDir && length $configDir);
    push @configCandidates, "$configDir/docx2txt.config";
}

foreach my $configFile (@configCandidates) {
    next if ! -f $configFile;

    # Only the first configuration file found is used.
    %config = do $configFile;
    warn "Ignoring errors in configuration file <$configFile>: $@" if $@;
    last;
}

# Each key names a global variable of this script (config_unzip,
# config_newLine, ...); the value replaces the built-in default.
foreach my $var (keys %config) {
    $$var = $config{$var};
}
300
301#
302# Check for unzip utility, before proceeding further.
303#
304
# Stop right away when the configured unzip program is not a plain file.
unless (-f $config_unzip) {
    die "Failed to locate unzip command '$config_unzip'!\n";
}
306
307
308#
309# Handle cases where this script reads docx file from STDIN.
310#
311
# With no arguments, or with '-' as the only argument, the document is read
# from STDIN and the text goes to STDOUT.  $inputFileName is the name used in
# later error messages.
if (@ARGV == 0) {
    $ARGV[0] = '-';
    $ARGV[1] = '-';
    $inputFileName = "STDIN";
} elsif (@ARGV == 1 && $ARGV[0] eq '-') {
    $ARGV[1] = '-';
    $inputFileName = "STDIN";
} else {
    $inputFileName = $ARGV[0];
}

# The unzip program needs a real file, so a document arriving on STDIN is
# first saved into a temporary file.  $tempFile stays global: the cleanup code
# later in the script removes the file through it.
if ($ARGV[0] eq '-') {
    my $tempFileError = "Can't create temporary file for storing docx file read from STDIN!\n";

    # File::Temp creates the file exclusively under an unpredictable name,
    # which a hand-built "pid + time" name in a shared directory does not
    # guarantee.
    require File::Temp;
    my $fhTemp;
    ($fhTemp, $tempFile) = eval {
        File::Temp::tempfile("dx2tTemp_${$}_XXXXXX",
                             DIR => $config_tempDir, SUFFIX => ".docx");
    };
    die $tempFileError if ! $fhTemp;

    # A docx file is a zip archive: keep both ends of the copy binary so that
    # no newline or end-of-file translation is applied to it.
    binmode $fhTemp;
    binmode STDIN;

    local $/ = undef;
    my $docxFileContent = <STDIN>;
    $docxFileContent = "" if ! defined $docxFileContent;

    my $written = print {$fhTemp} $docxFileContent;
    my $closed = close $fhTemp;
    if (! ($written && $closed)) {
        unlink($tempFile);
        die $tempFileError;
    }

    $ARGV[0] = $tempFile;
}
336
337
338#
339# Check for existence and readability of required file in specified directory,
340# and whether it is a text file.
341#
342
# check_for_required_file_in_folder(FILE, FOLDER)
#
# Dies unless FOLDER/FILE is a readable plain file that passes Perl's text
# file heuristic.
sub check_for_required_file_in_folder {
    my ($relPath, $folder) = @_;

    # A single stat() feeds every file test below through the "_" handle.
    stat("$folder/$relPath");

    unless (-f _ && -r _) {
        die "Can't read <$relPath> in <$folder>!\n";
    }

    unless (-T _) {
        die "<$folder/$relPath> does not seem to be a text file!\n";
    }
}
348
# readFileInto(FILE, BUFFER)
#
# Reads the whole of FILE, in binary mode, into the caller's second argument
# (it is assigned through the @_ alias).  Dies when FILE cannot be opened.
sub readFileInto {
    local $/ = undef;    # Slurp mode, restored on return.

    # Three-argument open: the name is taken literally, so mode characters or
    # surrounding whitespace in it are never interpreted.
    open my $fh, '<', $_[0] or die "Couldn't read file <$_[0]>!\n";
    binmode $fh;
    $_[1] = <$fh>;
    close $fh;
}
356
# readOptionalFileInto(FILE, BUFFER)
#
# Like readFileInto, but a FILE that is not an existing plain file is not an
# error: the caller's second argument is simply left as it is.  A plain file
# that is unreadable or does not look like text makes the script die.
sub readOptionalFileInto {
    local $/ = undef;    # Slurp mode, restored on return.

    stat("$_[0]");
    if (-f _) {
        if (-r _ && -T _) {
            # Three-argument open: the name is taken literally.
            open my $fh, '<', $_[0] or die "Couldn't read file <$_[0]>!\n";
            binmode $fh;
            $_[1] = <$fh>;
            close $fh;
        }
        else {
            die "Invalid <$_[0]>!\n";
        }
    }
}
373
374
375
376#
377# Check whether first argument is specifying a directory holding extracted
378# content of .docx file, or .docx file itself.
379#
380
# cleandie(MESSAGE)
#
# Removes the temporary copy of a document read from STDIN, when one exists,
# and then dies with MESSAGE.
sub cleandie {
    my ($message) = @_;

    if (-e "$tempFile") {
        unlink("$tempFile");
    }

    die "$message";
}
385
386
# One stat() of the input path serves all "_" file tests at this level.
stat($ARGV[0]);

if (-d _) {
    # Unzipped document: both XML parts read later must be present.
    foreach my $requiredPart ("word/document.xml", "word/_rels/document.xml.rels") {
        check_for_required_file_in_folder($requiredPart, $ARGV[0]);
    }
    $inpIsDir = 'y';
}
else {
    unless (-f _ && -r _) {
        cleandie("Can't read docx file <$inputFileName>!\n");
    }

    # A docx file is a zip archive; anything that looks like text is rejected.
    if (-T _) {
        cleandie("<$inputFileName> does not seem to be a docx file!\n");
    }
}
398
399
400#
401# Extract xml document content from argument docx file/directory.
402#
403
# Command prefix shared by every archive extraction in this script.
my $unzip_cmd = "'$config_unzip' $config_unzip_opts";

# NOTE(review): the input path is interpolated into a shell command line
# inside double quotes; a name containing '"', '$' or a backquote is seen by
# the shell.  Left unchanged here because the quoting rules differ between
# the supported platforms.
if ($inpIsDir ne 'y') {
    $content = `$unzip_cmd "$ARGV[0]" word/document.xml 2>$nullDevice`;
} else {
    readFileInto("$ARGV[0]/word/document.xml", $content);
}

unless ($content) {
    cleandie("Failed to extract required information from <$inputFileName>!\n");
}
413
414
415#
416# Be ready for outputting the extracted text contents.
417#
418
# Without an explicit output name, derive it from the input name: a trailing
# ".docx" becomes ".txt", any other name gets ".txt" appended.
if (@ARGV == 1) {
     $ARGV[1] = $ARGV[0];

     # Remove any trailing slashes to generate proper output filename, when
     # input is directory.
     $ARGV[1] =~ s%[/\\]+$%% if ($inpIsDir eq 'y');

     $ARGV[1] .= ".txt" if !($ARGV[1] =~ s/\.docx$/\.txt/);
}

# '-' selects STDOUT and is handled explicitly, so that every other name can
# be opened with the three-argument form, which takes the name literally
# (no mode characters, no whitespace stripping).
my $txtfile;
if ($ARGV[1] eq '-') {
    open($txtfile, '>&', \*STDOUT) || cleandie("Can't create <$ARGV[1]> for output!\n");
} else {
    open($txtfile, '>', $ARGV[1]) || cleandie("Can't create <$ARGV[1]> for output!\n");
}
binmode $txtfile;    # Ensure no auto-conversion of '\n' to '\r\n' on Windows.
432
433
434#
435# Gather information about header, footer, hyperlinks, images, footnotes etc.
436#
437
if ($inpIsDir eq 'y') {
    readFileInto("$ARGV[0]/word/_rels/document.xml.rels", $_);
} else {
    $_ = `$unzip_cmd "$ARGV[0]" word/_rels/document.xml.rels 2>$nullDevice`;
}

# %docurels maps "<kind>:<relationship id>" to the relationship target, where
# <kind> is the last path segment of the Type URI (e.g. "hyperlink").
# The three attributes are picked out of each <Relationship .../> element
# individually, so the order in which a producer writes them does not matter.
my %docurels;
while (/<Relationship\s+([^>]*?)\/?>/g)
{
    my $attrs = $1;

    my ($relId)     = $attrs =~ /\bId="([^"]*)"/;
    my ($relKind)   = $attrs =~ /\bType="[^"]*\/([^\/"]*)"/;
    my ($relTarget) = $attrs =~ /\bTarget="([^"]*)"/;

    next if ! (defined $relId && defined $relKind && defined $relTarget);

    $docurels{"$relKind:$relId"} = $relTarget;
}
449
450#
451# Gather list numbering information.
452#
453
# numbering.xml is optional; $_ stays empty when the document has no lists.
$_ = "";
if ($inpIsDir eq 'y') {
    readOptionalFileInto("$ARGV[0]/word/numbering.xml", $_);
} else {
    $_ = `$unzip_cmd "$ARGV[0]" word/numbering.xml 2>$nullDevice`;
}

# %abstractNum: "<abstractNumId>:<level>" => [ formatter, level text,
#               start value, indent in characters, left indent in twips ].
# @N2ANId:      numId => abstractNumId.
my %abstractNum;
my @N2ANId = ();

# Number formats that have a formatter subroutine; any other format leaves
# the formatter slot undefined.
my %NFList = (
    "bullet"      => \&bullet,
    "decimal"     => \&decimal,
    "lowerLetter" => \&lowerLetter,
    "upperLetter" => \&upperLetter,
    "lowerRoman"  => \&lowerRoman,
    "upperRoman"  => \&upperRoman
);

if ($_) {
    # [^>]* after the id lets the start tag carry further attributes (newer
    # Word versions add some), which the exact-match form rejected.
    while (/<w:abstractNum w:abstractNumId="(\d+)"[^>]*>(.*?)<\/w:abstractNum>/g)
    {
        my ($abstractNumId, $levels) = ($1, $2);

	while ($levels =~ /<w:lvl w:ilvl="(\d+)"[^>]*><w:start w:val="(\d+)"[^>]*><w:numFmt w:val="(.*?)"[^>]*>.*?<w:lvlText w:val="(.*?)"[^>]*>.*?<w:ind w:left="(\d+)" w:hanging="(\d+)"[^>]*>/g )
        {
            # $2: Start $3: NumFmt, $4: LvlText, ($5,$6): (Indent (twips), hanging)

            @{$abstractNum{"$abstractNumId:$1"}} = (
                $NFList{$3},
                $4,
                $2,
                int ((($5-$6) / $config_twipsPerChar) + 0.5),
                $5
            );
        }
    }

    # Same allowance for extra attributes on <w:num>.
    while ( /<w:num w:numId="(\d+)"[^>]*><w:abstractNumId w:val="(\d+)"/g )
    {
        $N2ANId[$1] = $2;
    }
}
497
498# Remove the temporary file (if) created to store input from STDIN. All the
499# (needed) data is read from it already.
if (-e "$tempFile") {
    unlink("$tempFile");
}
501
502
503#
504# Subroutines for center and right justification of text in a line.
505#
506
# justify(MODE, TEXT)
#
# Returns TEXT with leading blanks so that it appears centered (MODE
# "center") or right aligned (MODE "right") within $config_lineWidth columns.
# Text that is too long for that, and any other MODE, comes back unchanged.
sub justify {
    my ($mode, $text) = @_;
    my $textLen = length $text;
    my $padding = 0;

    if ($mode eq "center" && $textLen < ($config_lineWidth - 1)) {
        $padding = ($config_lineWidth - $textLen) / 2;
    } elsif ($mode eq "right" && $textLen < $config_lineWidth) {
        $padding = $config_lineWidth - $textLen;
    }

    # The repetition operator drops the fractional part of the count.
    return ' ' x $padding . $text;
}
518
519#
520# Subroutines for dealing with embedded links and images
521#
522
# hyperlink(RELATIONSHIP_ID, MARKUP)
#
# Returns MARKUP with all tags removed.  When $config_showHyperLink is "y"
# (any case) and the link target differs from the visible text, the target is
# appended as " [HYPERLINK: target]".  A relationship id without an entry in
# %docurels yields the bare text rather than an empty "[HYPERLINK: ]" marker.
sub hyperlink {
    my ($hlrid, $hltext) = @_;
    my $hlink = $docurels{"hyperlink:$hlrid"};

    $hltext =~ s/<[^>]*?>//g;

    if (defined $hlink && lc($config_showHyperLink) eq "y" && $hltext ne $hlink) {
        $hltext .= " [HYPERLINK: $hlink]";
    }

    return $hltext;
}
533
534#
535# Subroutines for processing numbering information.
536#
537
# Precomputed lower-case roman numerals for 0 .. 51; index 0 is the empty
# string.
my @RomanNumbers = ( "",
    "i", "ii", "iii", "iv", "v", "vi", "vii", "viii", "ix", "x", "xi", "xii",
    "xiii", "xiv", "xv", "xvi", "xvii", "xviii", "xix", "xx", "xxi", "xxii",
    "xxiii", "xxiv", "xxv", "xxvi", "xxvii", "xxviii", "xxix", "xxx", "xxxi",
    "xxxii", "xxxiii", "xxxiv", "xxxv", "xxxvi", "xxxvii", "xxxviii", "xxxix",
    "xl", "xli", "xlii", "xliii", "xliv", "xlv", "xlvi", "xlvii", "xlviii",
    "xlix", "l", "li" );


# lowerRoman(NUMBER)
#
# Returns NUMBER as a lower-case roman numeral.  Values below 1 give the
# empty string; values covered by @RomanNumbers are looked up, larger ones
# are composed from the symbol table below.
sub lowerRoman {
    # Guard first: a negative index would otherwise pick an entry counted
    # from the end of the lookup table.
    return "" if ($_[0] < 1);
    return $RomanNumbers[$_[0]] if ($_[0] < @RomanNumbers);

    # Symbols and their values, ascending; all lexical so that nothing leaks
    # into the package namespace.
    my @rcode = ("i", "iv", "v", "ix", "x", "xl", "l", "xc", "c", "cd", "d", "cm", "m");
    my @dval = (1, 4, 5, 9, 10, 40, 50, 90, 100, 400, 500, 900, 1000);

    my $roman = "";
    my $num = $_[0];

    my $i = $#rcode;
    while ($num > 0) {
        # Step down to the largest symbol value not exceeding the remainder.
        $i-- while ($num < $dval[$i]);
        $roman .= $rcode[$i] x int($num / $dval[$i]);
        $num = $num % $dval[$i];
    }

    return $roman;
}
566
# upperRoman(NUMBER): upper-case variant of lowerRoman.
sub upperRoman {
    my $lower = lowerRoman(@_);
    return uc($lower);
}
570
571
# lowerLetter(NUMBER)
#
# Returns the list label for NUMBER in the letter scheme a .. z, aa .. zz,
# aaa ..: the letter is chosen by position within the alphabet and repeated
# once more for every completed run of 26.  Values below 1 give the empty
# string.
sub lowerLetter {
    my ($num) = @_;
    return '' if $num < 1;

    # Lexical, so no package variable is created as a side effect of the call.
    my @alphabet = ('a' .. 'z');

    return $alphabet[($num - 1) % 26] x (int(($num - 1) / 26) + 1);
}
576
# upperLetter(NUMBER): upper-case variant of lowerLetter.
sub upperLetter {
    my $lower = lowerLetter(@_);
    return uc($lower);
}
580
581
# decimal(NUMBER): decimal list labels are the number itself.
sub decimal {
    my ($num) = @_;
    return $num;
}
585
586
# Bullet glyphs, keyed by their raw byte sequence, and the ASCII marker
# printed for each.
my %bullets = (
    "\x6F" => 'o',
    "\xEF\x81\xB6" => '::',	# Diamond
    "\xEF\x82\xA7" => '#',	# Small Black Square
    "\xEF\x82\xB7" => '*',	# Small Black Circle
    "\xEF\x83\x98" => '>',	# Arrowhead
    "\xEF\x83\xBC" => '+'	# Right Sign
);

# bullet(GLYPH): ASCII marker for GLYPH, or 'oo' when GLYPH has no entry.
sub bullet {
    my ($glyph) = @_;
    my $marker = $bullets{$glyph};

    return $marker if $marker;
    return 'oo';
}
599
# State shared across listNumbering calls, kept as three parallel stacks with
# one entry per currently open indentation depth:
#   @twipStack - left indent (twips) of that depth,
#   @keyStack  - "<abstractNumId>:<level>" key of the list using it,
#   @lastCnt   - last number handed out at that depth.
# $ssiz is the common stack size; the bottom entry stands for indent 0.
my @lastCnt = (0);
my @twipStack = (0);
my @keyStack = (undef);
my $ssiz = 1;

# listNumbering(NUM_ID, LEVEL)
#
# Returns the indented label (number or bullet, followed by one blank) for
# the next item of list NUM_ID at LEVEL.  The level description comes from
# %abstractNum: [0] formatter, [1] level text, [2] start value, [3] indent in
# characters, [4] left indent in twips.
sub listNumbering {
    my $aref = \@{$abstractNum{"$N2ANId[$_[0]]:$_[1]"}};
    my $lvlText;

    if ($aref->[0] != \&bullet) {
        my $key = "$N2ANId[$_[0]]:$_[1]";
        my $ccnt;

        # Moving back out: drop every depth that is indented further.
        if ($aref->[4] < $twipStack[$ssiz-1]) {
            while ($twipStack[$ssiz-1] > $aref->[4]) {
                pop @twipStack;
                pop @keyStack;
                pop @lastCnt;
                $ssiz--;
            }
        }

        if ($aref->[4] == $twipStack[$ssiz-1]) {
            # Same depth: continue the running list, or restart the count
            # when a different list takes this depth over.
            if ($key eq $keyStack[$ssiz-1]) {
                ++$lastCnt[$ssiz-1];
            }
            else {
                $keyStack[$ssiz-1] = $key;
                $lastCnt[$ssiz-1] = $aref->[2];
            }
        }
        elsif ($aref->[4] > $twipStack[$ssiz-1]) {
            # Deeper indent: open a new depth starting at the start value.
            push @twipStack, $aref->[4];
            push @keyStack, $key;
            push @lastCnt, $aref->[2];
            $ssiz++;
        }

        $ccnt = $lastCnt[$ssiz-1];

        # Number formats without a formatter of their own leave slot [0]
        # undefined; calling that would abort the whole conversion, so such
        # levels are numbered in plain decimal instead.
        my $numFmt = $aref->[0] || \&decimal;

        # The last %N placeholder is the current level ...
        $lvlText = $aref->[1];
        $lvlText =~ s/%\d([^%]*)$/($numFmt->($ccnt)).$1/oe;

        # ... and the remaining ones, right to left, take the counters of the
        # enclosing depths.
        my $i = $ssiz - 2;
        $i-- while ($lvlText =~ s/%\d([^%]*)$/$lastCnt[$i]$1/o);
    }
    else {
        $lvlText = $aref->[0]->($aref->[1]);
    }

    return ' ' x $aref->[3] . $lvlText . ' ';
}
652
653#
654# Subroutines for processing paragraph content.
655#
656
# processParagraph(MARKUP)
#
# Turns the inner markup of one paragraph into a text line: tags are removed
# and $config_newLine is appended.  When the markup carries a <w:jc> element
# its value is passed to justify() together with the line.
sub processParagraph {
    my $para = $_[0] . "$config_newLine";

    # List-context match: $align is the captured value, or undef when the
    # paragraph has no <w:jc>.  (A "my ... if ..." declaration is avoided
    # because perlsyn documents its behaviour as undefined.)
    my ($align) = $_[0] =~ /<w:jc w:val="([^"]*?)"\/>/;

    $para =~ s/<.*?>//og;
    return justify($align,$para) if $align;

    return $para;
}
666
667#
668# Text extraction starts.
669#
670
# Replacement text for the empty elements substituted below.
my %tag2chr = (tab => "\t", noBreakHyphen => "-", softHyphen => " - ");

# Drop the XML declaration.  The '?' after '<' is escaped: unescaped it only
# made the '<' optional, so the match started at "xml" and left "<?" behind.
$content =~ s/<\?xml .*?\?>(\r)?\n//;

# Remove the wp/wp14 elements together with their content.
$content =~ s{<(wp14|wp):[^>]*>.*?</\1:[^>]*>}||og;

# Remove the field instructions (instrText) and data (fldData), and deleted
# text.
$content =~ s{<w:(instrText|fldData|delText)[^>]*>.*?</w:\1>}||ogs;

# Mark cross-reference superscripting within [...].
$content =~ s|<w:vertAlign w:val="superscript"/></w:rPr><w:t>(.*?)</w:t>|[$1]|og;

$content =~ s{<w:(tab|noBreakHyphen|softHyphen)/>}|$tag2chr{$1}|og;

# A paragraph border block becomes a horizontal rule.
my $hr = '-' x $config_lineWidth . $config_newLine;
$content =~ s|<w:pBdr>.*?</w:pBdr>|$hr|og;

# Upper-case the text that follows a <w:caps/> marker.
$content =~ s{<w:caps/>.*?(<w:t>|<w:t [^>]+>)(.*?)</w:t>}/uc $2/oge;

$content =~ s{<w:hyperlink r:id="(.*?)".*?>(.*?)</w:hyperlink>}/hyperlink($1,$2)/oge;

$content =~ s|<w:numPr><w:ilvl w:val="(\d+)"/><w:numId w:val="(\d+)"\/>|listNumbering($2,$1)|oge;

# Indentation: (left or firstLine) minus hanging, converted from twips to
# blanks.
$content =~ s{<w:ind w:(left|firstLine)="(\d+)"( w:hanging="(\d+)")?[^>]*>}|' ' x int((($2-$4)/$config_twipsPerChar)+0.5)|oge;

# Empty paragraphs, with or without attributes, and line breaks each give one
# newline.
$content =~ s{<w:p(?: [^/>]+?)?/>|<w:br/>}|$config_newLine|og;

# Paragraph start tag with or without attributes.  Requiring whitespace
# before any attributes keeps elements such as <w:pPr> from being taken for
# a paragraph start, and lets a bare <w:p> (which the former pattern never
# matched, gluing such paragraphs together) be processed like any other.
$content =~ s/<w:p(?:\s[^>]*?)?>(.*?)<\/w:p>/processParagraph($1)/ogse;

# Whatever markup is left carries no text.
$content =~ s/<.*?>//og;
702
703
704#
705# Convert non-ASCII characters/character sequences to ASCII characters.
706#
707
# $1 is the leading byte(s) of a multi-byte sequence and $2 its final byte;
# sequences without an entry in %splchars are put back unchanged.
$content =~ s/(\xC2|\xC3|\xCF|\xE2.|\xEF.)(.)/($splchars{$1}{$2} ? $splchars{$1}{$2} : $1.$2)/oge;

#
# Convert docx specific (reserved HTML/XHTML) escape characters.
#
$content =~ s/(&)(amp|apos|gt|lt|quot)(;)/$escChrs{lc $2}/iog;

#
# Write the extracted and converted text contents to output.
#

# Buffered write errors (full disk, closed pipe) may only show up at close
# time; report them instead of finishing silently with truncated output.
print {$txtfile} $content
    or die "Failed to write output to <$ARGV[1]>: $!\n";
close $txtfile
    or die "Failed to write output to <$ARGV[1]>: $!\n";
721
722