#!/usr/bin/env perl

# docx2txt, a command-line utility to convert Docx documents to text format.
# Copyright (C) 2008-2014 Sandeep Kumar
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

#
# This script extracts text from document.xml contained inside .docx file.
# Perl v5.10.1 was used for testing this script.
#
# Author : Sandeep Kumar (shimple0 -AT- Yahoo .DOT. COM)
#
# ChangeLog :
#
#    10/08/2008 - Initial version (v0.1)
#    15/08/2008 - Script takes two arguments [second optional] now and can be
#                 used independently to extract text from docx file. It accepts
#                 docx file directly, instead of xml file.
#    18/08/2008 - Added support for center and right justification of text that
#                 fits in a line 80 characters wide (adjustable).
#    03/09/2008 - Fixed the slip in usage message.
#    12/09/2008 - Slightly changed the script invocation and argument handling
#                 to incorporate some of the shell script functionality here.
#                 Added support to handle embedded urls in docx document.
#    23/09/2008 - Changed #! line to use /usr/bin/env - good suggestion from
#                 Rene Maroufi (info>AT<maroufi>DOT<net) to reduce user work
#                 during installation.
#    31/08/2009 - Added support for handling more escape characters.
#                 Using OS specific null device to redirect stderr.
#                 Saving text file in binary mode.
#    03/09/2009 - Updates based on feedback/suggestions from Sergei Kulakov
#                 (sergei>AT<dewia>DOT<com).
#                 - removal of non-document text in between TOC related tags.
#                 - display of hyperlink alongside linked text user controlled.
#                 - some character conversion updates
#    05/09/2009 - Merged cjustify and rjustify into single subroutine justify.
#                 Added more character conversions.
#                 Organised conversion mappings in tabular form for speedup and
#                 easy maintenance.
#                 Tweaked code to reduce number of passes over document content.
#    10/09/2009 - For leaner text experience, hyperlink is not displayed if
#                 hyperlink and hyperlinked text are same, even if user has
#                 enabled hyperlink display.
#                 Improved handling of short line justification. Many
#                 justification tag patterns were not captured earlier.
#    11/09/2009 - A directory holding the unzipped content of .docx file can
#                 also be specified as argument to the script, in place of file.
#    17/09/2009 - Removed trailing slashes from input directory name.
#                 Updated unzip command invocations to handle path names
#                 containing spaces.
#    01/10/2009 - Added support for configuration file.
#    02/10/2009 - Using single quotes to specify path for unzip command.
#    04/10/2009 - Corrected configuration option name lineIndent to listIndent.
#    11/12/2011 - Configuration variables now begin with config_ .
#                 Configuration file is looked for in HOME directory as well.
#                 Added a check for existence of unzip command.
#                 Superscripted cross-references are placed within [...] now.
#                 Fixed bugs #3003903, #3082018 and #3082035.
#                 Fixed nullDevice for Cygwin.
#    12/12/2011 - Configuration file is also looked for in /usr/local/etc, default
#                 location for Unix-ish systems.
#    22/12/2011 - Added &apos; and &quot; to docx specific escape characters
#                 conversions.
#                 [Bug #3463033]
#    24/12/2011 - Improved handling of special (non-text) characters, along with
#                 support for more non-text characters.
#    05/01/2012 - Configuration file is now looked for in current directory,
#                 user configuration directory and system configuration
#                 directory (in the specified order). This streamlining allows
#                 for per user configuration file even on Windows.
#    14/01/2012 - Wrong code was committed during earlier fixing of nullDevice
#                 for Cygwin, fixed that.
#                 Usage is extended to accept docx file from standard input.
#                 "-h" has to be given as the first argument to get usage help.
#                 Added new configuration variable "config_tempDir".
#    14/03/2014 - Remove deleted text from output. This takes effect in case
#                 changes are being tracked in docx document. Patch was
#                 contributed by William Parsons (wbparsons>AT<cshore>DOT<com).
#                 Removed experimental config option config_exp_extra_deEscape.
#    27/03/2014 - Remove non-document_text content marked by wp/wp14 tags.
#    07/04/2014 - Added support for handling lists (bullet, decimal, letter,
#                 roman) along with (attempt at) indentation.
#                 Added new configuration variable config_twipsPerChar.
#                 Removed configuration variable config_listIndent.
#    14/04/2014 - Fixed list numbering - lvl start value needs to be considered.
#                 Improved list indentation and corresponding code.
#    27/04/2014 - Improved paragraph content layout/indentation.
#    13/05/2014 - Added new configuration variable config_unzip_opts. Users can
#                 now use unzipping programs like 7z, pkzipc, winzip as well.
#


#
# The default settings below can be overridden via docx2txt.config in current
# directory/ user configuration directory/ system configuration directory.
#
# These are package variables (our) on purpose: the configuration loader
# assigns to them by name.
#

our $config_unzip = '/usr/bin/unzip';   # Windows path like 'C:/path/to/unzip.exe'
our $config_unzip_opts = '-p';          # To extract file on standard output

our $config_newLine = "\n";             # Alternative is "\r\n".
our $config_lineWidth = 80;             # Line width, used for short line justification.
our $config_showHyperLink = "N";        # Show hyperlink alongside linked text.
our $config_tempDir;                    # Directory for temporary file creation.
our $config_twipsPerChar = 120;         # Approx mapping for layout purpose.


#
# Windows/Non-Windows specific settings. Adjust these here, if needed.
#

our $nullDevice;        # Where stderr of the unzip command is discarded.
our $userConfigDir;     # Per-user location searched for docx2txt.config.
our $systemConfigDir;   # System-wide location searched for docx2txt.config.

# A native Windows environment sets OS=Windows_NT and has neither OSTYPE nor
# HOME; Cygwin-like environments define those and are treated as Unix-ish.
# OS is normally unset elsewhere, hence the defined() guard.
if (defined $ENV{OS} && $ENV{OS} =~ /^Windows/ &&
    !(exists $ENV{OSTYPE} || exists $ENV{HOME})) {
    $nullDevice = "nul";
    $userConfigDir = $ENV{APPDATA};

    #
    # On Windows, configuration file is installed in same folder as this script.
    # When the script is invoked without any directory component the pattern
    # does not match; fall back to the current directory instead of reading a
    # capture variable that was never set.
    #
    if ($0 =~ m%^(.*[/\\])[^/\\]*?$%) {
        $systemConfigDir = $1;
    } else {
        $systemConfigDir = ".";
    }

    $config_tempDir = "$ENV{TEMP}";
} else {
    $nullDevice = "/dev/null";
    $userConfigDir = $ENV{HOME};
    $systemConfigDir = "/usr/local/etc";

    $config_tempDir = "/tmp";
}


#
# Character conversion tables
#

# Only (amp, apos, gt, lt and quot) are the required reserved characters in HTML
# and XHTML, others are used for better text experience.
# Entity name => plain-text replacement.
my %escChrs = (
    amp => '&', apos => '\'', gt => '>', lt => '<', quot => '"',
    acute => '\'', brvbar => '|', copy => '(C)', divide => '/',
    laquo => '<<', macr => '-', nbsp => ' ', raquo => '>>',
    reg => '(R)', shy => '-', times => 'x'
);

# UTF-8 byte sequences => ASCII replacement. The outer key is the leading
# byte(s) of the sequence and the inner key is its final byte, matching the
# two capture groups of the conversion regex applied to the document text.
my %splchars = (
    "\xC2" => {
        "\xA0" => ' ',      # <nbsp> non-breaking space
        "\xA2" => 'cent',   # <cent>
        "\xA3" => 'Pound',  # <pound>
        "\xA5" => 'Yen',    # <yen>
        "\xA6" => '|',      # <brvbar> broken vertical bar
#       "\xA7" => '',       # <sect> section
        "\xA9" => '(C)',    # <copy> copyright
        "\xAB" => '<<',     # <laquo> angle quotation mark (left)
        "\xAC" => '-',      # <not> negation
        "\xAE" => '(R)',    # <reg> registered trademark
        "\xB1" => '+-',     # <plusmn> plus-or-minus
        "\xB4" => '\'',     # <acute>
        "\xB5" => 'u',      # <micro>
#       "\xB6" => '',       # <para> paragraph
        "\xBB" => '>>',     # <raquo> angle quotation mark (right)
        "\xBC" => '(1/4)',  # <frac14> fraction 1/4
        "\xBD" => '(1/2)',  # <frac12> fraction 1/2
        "\xBE" => '(3/4)',  # <frac34> fraction 3/4
    },

    "\xC3" => {
        "\x97" => 'x',      # <times> multiplication
        "\xB7" => '/',      # <divide> division
    },

    "\xCF" => {
        "\x80" => 'PI',     # <pi>
    },

    "\xE2\x80" => {
        "\x82" => ' ',      # <ensp> en space
        "\x83" => ' ',      # <emsp> em space
        "\x85" => ' ',      # <qemsp>
        "\x93" => ' - ',    # <ndash> en dash
        "\x94" => ' -- ',   # <mdash> em dash
        "\x95" => '--',     # <horizontal bar>
        "\x98" => '`',      # <soq>
        "\x99" => '\'',     # <scq>
        "\x9C" => '"',      # <doq>
        "\x9D" => '"',      # <dcq>
        "\xA2" => '::',     # <diamond symbol>
        "\xA6" => '...',    # <hellip> horizontal ellipsis
        "\xB0" => '%.',     # <permil> per mille
    },

    "\xE2\x82" => {
        "\xAC" => 'Euro'    # <euro>
    },

    "\xE2\x84" => {
        "\x85" => 'c/o',    # <care/of>
        "\x97" => '(P)',    # <sound recording copyright>
        "\xA0" => '(SM)',   # <servicemark>
        "\xA2" => '(TM)',   # <trade> trademark
        "\xA6" => 'Ohm',    # <Ohm>
    },

    "\xE2\x85" => {
        "\x93" => '(1/3)',
        "\x94" => '(2/3)',
        "\x95" => '(1/5)',
        "\x96" => '(2/5)',
        "\x97" => '(3/5)',
        "\x98" => '(4/5)',
        "\x99" => '(1/6)',
        "\x9B" => '(1/8)',
        "\x9C" => '(3/8)',
        "\x9D" => '(5/8)',
        "\x9E" => '(7/8)',
        "\x9F" => '1/',
    },

    "\xE2\x86" => {
        "\x90" => '<--',    # <larr> left arrow
        "\x92" => '-->',    # <rarr> right arrow
        "\x94" => '<-->',   # <harr> left right arrow
    },

    "\xE2\x88" => {
        "\x82" => 'd',      # partial differential
        "\x9E" => 'infinity',
    },

    "\xE2\x89" => {
        "\xA0" => '!=',     # <neq>
        "\xA4" => '<=',     # <leq>
        "\xA5" => '>=',     # <geq>
    },

    "\xEF\x82" => {
        "\xB7" => '*'       # small white square
    }
);


#
# Check argument(s) sanity.
#

my $usage = <<USAGE;

Usage:  $0 [infile.docx|-|-h] [outfile.txt|-]
        $0 < infile.docx
        $0 < infile.docx > outfile.txt

        In second usage, output is dumped on STDOUT.

        Use '-h' as the first argument to get this usage information.

        Use '-' as the infile name to read the docx file from STDIN.

        Use '-' as the outfile name to dump the text on STDOUT.
        Output is saved in infile.txt if second argument is omitted.

Note:   infile.docx can also be a directory name holding the unzipped content
        of concerned .docx file.

USAGE

# Running without arguments is valid (docx is then read from STDIN), so only
# look at the first argument when there is one.
die $usage if (@ARGV > 2 || (@ARGV && $ARGV[0] eq '-h'));


#
# Look for configuration file in current directory/ user configuration
# directory/ system configuration directory - in the given order.
#

my %config;

# Only the first candidate that exists is consulted, as before.
foreach my $configFile ("docx2txt.config",
                        "$userConfigDir/docx2txt.config",
                        "$systemConfigDir/docx2txt.config") {
    next if ! -f $configFile;

    # "do FILE" looks a relative name up through @INC, and "." is no longer
    # part of @INC since Perl 5.26 - so hand it an absolute path.
    require File::Spec;
    my $configPath = File::Spec->rel2abs($configFile);
    my @settings = do $configPath;

    # A lone undef is what "do" returns when the file could not be read ($!)
    # or compiled ($@). Report it rather than building a junk hash from it.
    if (@settings == 1 && !defined $settings[0]) {
        warn "Can't load configuration file <$configFile>: " . ($@ || $!) . "\n";
    } else {
        %config = @settings;
    }

    last;
}

# Each key names one of the package variables (config_*) to override.
foreach my $var (keys %config) {
    no strict 'refs';   # Assignment by variable name is intended here.
    $$var = $config{$var};
}

#
# Check for unzip utility, before proceeding further.
#

die "Failed to locate unzip command '$config_unzip'!\n" if ! -f $config_unzip;


#
# Handle cases where this script reads docx file from STDIN.
#

our $inputFileName;     # Name shown in messages ("STDIN" for piped input).
our $tempFile;          # Temporary copy of a docx read from STDIN, if any.

if (@ARGV == 0) {
    $ARGV[0] = '-';
    $ARGV[1] = '-';
    $inputFileName = "STDIN";
} elsif (@ARGV == 1 && $ARGV[0] eq '-') {
    $ARGV[1] = '-';
    $inputFileName = "STDIN";
} else {
    $inputFileName = $ARGV[0];
}

if ($ARGV[0] eq '-') {
    # unzip needs a real file, so spool STDIN into a temporary one. File::Temp
    # picks an unpredictable name and creates the file exclusively, which
    # matters because the temporary directory is usually shared.
    require File::Temp;

    my $fhTemp;
    ($fhTemp, $tempFile) = eval {
        File::Temp::tempfile("dx2tTemp_XXXXXX", DIR => $config_tempDir,
                             SUFFIX => ".docx", UNLINK => 0);
    };
    die "Can't create temporary file for storing docx file read from STDIN!\n" if ! $fhTemp;

    # A docx is a zip archive: no newline translation on either side.
    binmode STDIN;
    binmode $fhTemp;

    local $/ = undef;
    my $docxFileContent = <STDIN>;
    $docxFileContent = '' if ! defined $docxFileContent;

    if (! (print {$fhTemp} $docxFileContent) || ! close $fhTemp) {
        my $err = $!;
        unlink($tempFile);
        die "Can't write temporary file <$tempFile>: $err\n";
    }

    $ARGV[0] = $tempFile;
}


#
# Check for existence and readability of required file in specified directory,
# and whether it is a text file.
#
# Arguments: (file path relative to the folder, folder). Dies on failure.
#

sub check_for_required_file_in_folder {
    my ($file, $folder) = @_;

    stat("$folder/$file");
    die "Can't read <$file> in <$folder>!\n" if ! (-f _ && -r _);
    die "<$folder/$file> does not seem to be a text file!\n" if ! -T _;
}

#
# Read the whole of file $_[0] into the caller's variable passed as $_[1].
# Dies if the file can not be opened.
#

sub readFileInto {
    local $/ = undef;
    open my $fh, '<', $_[0] or die "Couldn't read file <$_[0]>!\n";
    binmode $fh;
    $_[1] = <$fh>;      # @_ aliases the caller's variable.
    close $fh;
}

#
# Like readFileInto, but a missing file is not an error: the caller's variable
# is then left untouched. An existing file that is unreadable or not text is
# fatal.
#

sub readOptionalFileInto {
    local $/ = undef;

    stat("$_[0]");
    if (-f _) {
        if (-r _ && -T _) {
            open my $fh, '<', $_[0] or die "Couldn't read file <$_[0]>!\n";
            binmode $fh;
            $_[1] = <$fh>;
            close $fh;
        }
        else {
            die "Invalid <$_[0]>!\n";
        }
    }
}

#
# Die with the given message, after removing the temporary file (if any).
#

sub cleandie {
    unlink($tempFile) if defined $tempFile && -e $tempFile;
    die "$_[0]";
}

#
# Quote a path for the shell that backticks hand the command line to.
#

sub shellQuote {
    my ($arg) = @_;

    # cmd.exe knows double quotes only.
    return "\"$arg\"" if $^O eq 'MSWin32';

    # Inside single quotes a POSIX shell expands nothing, so a file name
    # containing $(...), backticks or double quotes can not inject commands.
    $arg =~ s/'/'\\''/g;
    return "'$arg'";
}


#
# Check whether first argument is specifying a directory holding extracted
# content of .docx file, or .docx file itself.
#

our $inpIsDir = 'n';

stat($ARGV[0]);

if (-d _) {
    check_for_required_file_in_folder("word/document.xml", $ARGV[0]);
    check_for_required_file_in_folder("word/_rels/document.xml.rels", $ARGV[0]);
    $inpIsDir = 'y';
}
else {
    cleandie "Can't read docx file <$inputFileName>!\n" if ! (-f _ && -r _);
    cleandie "<$inputFileName> does not seem to be a docx file!\n" if -T _;
}


#
# Extract xml document content from argument docx file/directory.
#

my $unzip_cmd = "'$config_unzip' $config_unzip_opts";

#
# Return the content of the named member of the docx file given as $ARGV[0];
# empty when the member does not exist or unzip fails.
#

sub readFromDocx {
    my ($member) = @_;
    my $docx = shellQuote($ARGV[0]);

    return `$unzip_cmd $docx $member 2>$nullDevice`;
}

our $content;

if ($inpIsDir eq 'y') {
    readFileInto("$ARGV[0]/word/document.xml", $content);
} else {
    $content = readFromDocx("word/document.xml");
}

cleandie "Failed to extract required information from <$inputFileName>!\n" if ! $content;


#
# Be ready for outputting the extracted text contents.
#

if (@ARGV == 1) {
    $ARGV[1] = $ARGV[0];

    # Remove any trailing slashes to generate proper output filename, when
    # input is directory.
    $ARGV[1] =~ s%[/\\]+$%% if ($inpIsDir eq 'y');

    $ARGV[1] .= ".txt" if !($ARGV[1] =~ s/\.docx$/\.txt/);
}

my $txtfile;

# '-' means STDOUT. The two-argument open used to provide that implicitly;
# with the safer three-argument form it has to be spelled out.
if ($ARGV[1] eq '-') {
    open($txtfile, '>&', \*STDOUT) || cleandie "Can't create <$ARGV[1]> for output!\n";
} else {
    open($txtfile, '>', $ARGV[1]) || cleandie "Can't create <$ARGV[1]> for output!\n";
}
binmode $txtfile;    # Ensure no auto-conversion of '\n' to '\r\n' on Windows.


#
# Gather information about header, footer, hyperlinks, images, footnotes etc.
#

my $relsXml;
if ($inpIsDir eq 'y') {
    readFileInto("$ARGV[0]/word/_rels/document.xml.rels", $relsXml);
} else {
    $relsXml = readFromDocx("word/_rels/document.xml.rels");
}
$relsXml = "" if ! defined $relsXml;

# "<last path component of Type>:<Id>" => Target, e.g. "hyperlink:rId4".
my %docurels;
while ($relsXml =~ /<Relationship Id="(.*?)" Type=".*?\/([^\/]*?)" Target="(.*?)"( .*?)?\/>/g)
{
    $docurels{"$2:$1"} = $3;
}

#
# Gather list numbering information.
#

my $numberingXml = "";
if ($inpIsDir eq 'y') {
    readOptionalFileInto("$ARGV[0]/word/numbering.xml", $numberingXml);
} else {
    $numberingXml = readFromDocx("word/numbering.xml");
}

my %abstractNum;    # "<abstractNumId>:<ilvl>" => level description (see below)
my @N2ANId = ();    # numId => abstractNumId

my %NFList = (
    "bullet"      => \&bullet,
    "decimal"     => \&decimal,
    "lowerLetter" => \&lowerLetter,
    "upperLetter" => \&upperLetter,
    "lowerRoman"  => \&lowerRoman,
    "upperRoman"  => \&upperRoman
);

if ($numberingXml) {
    while ($numberingXml =~ /<w:abstractNum w:abstractNumId="(\d+)">(.*?)<\/w:abstractNum>/g)
    {
        my ($abstractNumId, $levels) = ($1, $2);

        while ($levels =~ /<w:lvl w:ilvl="(\d+)"[^>]*><w:start w:val="(\d+)"[^>]*><w:numFmt w:val="(.*?)"[^>]*>.*?<w:lvlText w:val="(.*?)"[^>]*>.*?<w:ind w:left="(\d+)" w:hanging="(\d+)"[^>]*>/g )
        {
            # $1: ilvl, $2: Start, $3: NumFmt, $4: LvlText,
            # ($5,$6): (Indent (twips), hanging)

            @{$abstractNum{"$abstractNumId:$1"}} = (
                $NFList{$3},    # [0] formatter (undef for unsupported formats)
                $4,             # [1] level text template
                $2,             # [2] start value
                int ((($5-$6) / $config_twipsPerChar) + 0.5),   # [3] indent (chars)
                $5              # [4] left indent (twips)
            );
        }
    }

    while ($numberingXml =~ /<w:num w:numId="(\d+)"><w:abstractNumId w:val="(\d+)"/g)
    {
        $N2ANId[$1] = $2;
    }
}

# Remove the temporary file (if) created to store input from
# STDIN. All the
# (needed) data is read from it already.
unlink($tempFile) if defined $tempFile && -e $tempFile;


#
# Subroutines for center and right justification of text in a line.
#
# Arguments: (alignment, text). Text is padded on the left for "center" and
# "right" when it is shorter than the configured line width; any other
# alignment, or longer text, is returned unchanged.
#

sub justify {
    my ($align, $text) = @_;
    my $len = length $text;

    if ($align eq "center" && $len < ($config_lineWidth - 1)) {
        return ' ' x (($config_lineWidth - $len) / 2) . $text;
    } elsif ($align eq "right" && $len < $config_lineWidth) {
        return ' ' x ($config_lineWidth - $len) . $text;
    } else {
        return $text;
    }
}

#
# Subroutines for dealing with embedded links and images
#
# Arguments: (relationship id, marked-up link text). Returns the plain link
# text, followed by the target when hyperlink display is enabled and the
# target is known and differs from the text.
#

sub hyperlink {
    my ($hlrid, $hltext) = @_;
    my $hlink = $docurels{"hyperlink:$hlrid"};

    $hltext =~ s/<[^>]*?>//g;
    $hltext .= " [HYPERLINK: $hlink]"
        if (lc $config_showHyperLink eq "y" && defined $hlink && $hltext ne $hlink);

    return $hltext;
}

#
# Subroutines for processing numbering information.
#

# Lookup table for the small numbers that make up almost all list labels.
my @RomanNumbers = ( "",
    "i", "ii", "iii", "iv", "v", "vi", "vii", "viii", "ix", "x", "xi", "xii",
    "xiii", "xiv", "xv", "xvi", "xvii", "xviii", "xix", "xx", "xxi", "xxii",
    "xxiii", "xxiv", "xxv", "xxvi", "xxvii", "xxviii", "xxix", "xxx", "xxxi",
    "xxxii", "xxxiii", "xxxiv", "xxxv", "xxxvi", "xxxvii", "xxxviii", "xxxix",
    "xl", "xli", "xlii", "xliii", "xliv", "xlv", "xlvi", "xlvii", "xlviii",
    "xlix", "l", "li" );

# Roman symbols with their values, in ascending order, for larger numbers.
my @romanCodes  = ("i", "iv", "v", "ix", "x", "xl", "l", "xc", "c", "cd", "d", "cm", "m");
my @romanValues = (1, 4, 5, 9, 10, 40, 50, 90, 100, 400, 500, 900, 1000);

my @Alphabets = split '', "abcdefghijklmnopqrstuvwxyz";


sub lowerRoman {
    my $num = $_[0];

    return $RomanNumbers[$num] if ($num < @RomanNumbers);

    my $roman = "";
    my $i = $#romanCodes;

    # Greedy conversion: use the largest symbol that still fits.
    while ($num > 0) {
        $i-- while ($num < $romanValues[$i]);
        $roman .= $romanCodes[$i] x int($num / $romanValues[$i]);
        $num = $num % $romanValues[$i];
    }

    return $roman;
}

sub upperRoman {
    return uc lowerRoman(@_);
}


# 1 => a ... 26 => z, 27 => aa, 28 => bb ... 53 => aaa, and so on.
sub lowerLetter {
    my $num = $_[0];

    # For multiples of 26 the index is -1, i.e. the last letter.
    return $Alphabets[($num % 26) - 1] x int(($num - 1)/26 + 1);
}

sub upperLetter {
    return uc lowerLetter(@_);
}


sub decimal {
    return $_[0];
}


# Bullet character (as found in the level text) => ASCII replacement.
my %bullets = (
    "\x6F" => 'o',
    "\xEF\x81\xB6" => '::',     # Diamond
    "\xEF\x82\xA7" => '#',      # Small Black Square
    "\xEF\x82\xB7" => '*',      # Small Black Circle
    "\xEF\x83\x98" => '>',      # Arrowhead
    "\xEF\x83\xBC" => '+'       # Right Sign
);

sub bullet {
    return $bullets{$_[0]} || 'oo';
}

# State of the numbered lists currently open, innermost last. The three
# stacks are kept in step, and $ssiz is their common size.
my @lastCnt = (0);          # last number handed out
my @twipStack = (0);        # left indentation (twips)
my @keyStack = (undef);     # "<abstractNumId>:<ilvl>" of the list level
my $ssiz = 1;

#
# Arguments: (numId, ilvl). Returns the indented label (number or bullet) for
# the next item of that list level.
#

sub listNumbering {
    my ($numId, $ilvl) = @_;
    my $key = "$N2ANId[$numId]:$ilvl";
    my $aref = \@{$abstractNum{$key}};
    my $numFmt = $aref->[0];
    my $lvlText;

    if (defined $numFmt && $numFmt == \&bullet) {
        $lvlText = $numFmt->($aref->[1]);
    }
    else {
        # Formats without a formatter of their own (decimalZero, ordinal, ...)
        # are numbered as decimals instead of aborting the whole conversion.
        $numFmt = \&decimal if ! defined $numFmt;

        # Leave the lists that are indented deeper than this item.
        while ($twipStack[$ssiz-1] > $aref->[4]) {
            pop @twipStack;
            pop @keyStack;
            pop @lastCnt;
            $ssiz--;
        }

        if ($aref->[4] == $twipStack[$ssiz-1]) {
            if ($key eq $keyStack[$ssiz-1]) {
                # Next item of the same list.
                ++$lastCnt[$ssiz-1];
            }
            else {
                # A different list at the same indentation starts afresh.
                $keyStack[$ssiz-1] = $key;
                $lastCnt[$ssiz-1] = $aref->[2];
            }
        }
        elsif ($aref->[4] > $twipStack[$ssiz-1]) {
            # A nested list begins.
            push @twipStack, $aref->[4];
            push @keyStack, $key;
            push @lastCnt, $aref->[2];
            $ssiz++;
        }

        my $ccnt = $lastCnt[$ssiz-1];

        # The last %N placeholder stands for this level ...
        $lvlText = $aref->[1];
        $lvlText =~ s/%\d([^%]*)$/($numFmt->($ccnt)).$1/e;

        # ... and the ones before it for the enclosing levels, right to left.
        my $i = $ssiz - 2;
        $i-- while ($lvlText =~ s/%\d([^%]*)$/$lastCnt[$i]$1/);
    }

    return ' ' x $aref->[3] . $lvlText . ' ';
}

#
# Subroutines for processing paragraph content.
#
# Argument: the markup between <w:p ...> and </w:p>. Returns the paragraph
# text without tags, terminated by a newline and justified if so marked.
#

sub processParagraph {
    my $para = $_[0] . "$config_newLine";

    # Declared separately from the conditional assignment: the behaviour of
    # "my $x = ... if ...;" is undefined.
    my $align;
    $align = $1 if ($_[0] =~ /<w:jc w:val="([^"]*?)"\/>/);

    $para =~ s/<.*?>//g;
    return justify($align,$para) if $align;

    return $para;
}

#
# Text extraction starts.
#

my %tag2chr = (tab => "\t", noBreakHyphen => "-", softHyphen => " - ");

# Drop the XML declaration. The '?' after '<' must be escaped - unescaped it
# makes the '<' optional, and "<?" is left behind in the content.
$content =~ s/<\?xml .*?\?>(\r)?\n//;

$content =~ s{<(wp14|wp):[^>]*>.*?</\1:[^>]*>}||og;

# Remove the field instructions (instrText) and data (fldData), and deleted
# text.
$content =~ s{<w:(instrText|fldData|delText)[^>]*>.*?</w:\1>}||ogs;

# Mark cross-reference superscripting within [...].
$content =~ s|<w:vertAlign w:val="superscript"/></w:rPr><w:t>(.*?)</w:t>|[$1]|og;

$content =~ s{<w:(tab|noBreakHyphen|softHyphen)/>}|$tag2chr{$1}|og;

# Paragraph borders are shown as a horizontal rule.
my $hr = '-' x $config_lineWidth . $config_newLine;
$content =~ s|<w:pBdr>.*?</w:pBdr>|$hr|og;

$content =~ s{<w:caps/>.*?(<w:t>|<w:t [^>]+>)(.*?)</w:t>}/uc $2/oge;

$content =~ s{<w:hyperlink r:id="(.*?)".*?>(.*?)</w:hyperlink>}/hyperlink($1,$2)/oge;

$content =~ s|<w:numPr><w:ilvl w:val="(\d+)"/><w:numId w:val="(\d+)"\/>|listNumbering($2,$1)|oge;

# Indentation is approximated with spaces; the hanging part is optional.
$content =~ s{<w:ind w:(left|firstLine)="(\d+)"( w:hanging="(\d+)")?[^>]*>}|' ' x int((($2-($4 \|\| 0))/$config_twipsPerChar)+0.5)|oge;

$content =~ s{<w:p [^/>]+?/>|<w:br/>}|$config_newLine|og;

$content =~ s/<w:p[^>]+?>(.*?)<\/w:p>/processParagraph($1)/ogse;

# Whatever markup is left carries no text.
$content =~ s/<.*?>//og;


#
# Convert non-ASCII characters/character sequences to ASCII characters.
#

$content =~ s/(\xC2|\xC3|\xCF|\xE2.|\xEF.)(.)/($splchars{$1}{$2} ? $splchars{$1}{$2} : $1.$2)/oge;

#
# Convert docx specific (reserved HTML/XHTML) escape characters.
#
$content =~ s/(&)(amp|apos|gt|lt|quot)(;)/$escChrs{lc $2}/iog;

#
# Write the extracted and converted text contents to output.
#

# Buffered write errors (full disk, closed pipe) may surface only at close,
# so both calls are checked.
print {$txtfile} $content or die "Can't write the extracted text to output: $!\n";
close $txtfile or die "Can't write the extracted text to output: $!\n";