#!/usr/local/bin/perl
#
# Sample external converter for htdig 3.1.4 or later.
# Usage: (in htdig.conf)
#
# external_parsers: application/msword->text/html /usr/local/bin/conv_doc.pl \
#                   application/postscript->text/html /usr/local/bin/conv_doc.pl \
#                   application/pdf->text/html /usr/local/bin/conv_doc.pl
#
# The script is invoked as:  conv_doc.pl FILE CONTENT-TYPE URL
# It sniffs the first bytes of FILE to pick a text converter, runs it, and
# writes a minimal HTML document (title + escaped text) to standard output.
#
# Written by Gilles Detillieux <grdetil@scrc.umanitoba.ca>.
# Based in part on the parse_word_doc.pl script, written by
# Jesse op den Brouw <MSQL_User@st.hhs.nl> but heavily revised.
#
# 1998/12/11
# Added:   catdoc test (is catdoc runnable?)      <carl@dpiwe.tas.gov.au>
# 1999/02/09
# Added:   uses ps2ascii to handle PS files       <grdetil@scrc.umanitoba.ca>
# 1999/02/15
# Added:   check for some file formats            <Frank.Richter@hrz.tu-chemnitz.de>
# 1999/02/25
# Added:   uses pdftotext to handle PDF files     <grdetil@scrc.umanitoba.ca>
# 1999/03/01
# Added:   extra checks for file "wrappers"       <grdetil@scrc.umanitoba.ca>
#          & check for MS Word signature (no longer defaults to catdoc)
# 1999/03/05
# Changed: rejoin hyphenated words across lines   <grdetil@scrc.umanitoba.ca>
#          (in PDFs)
# 1999/08/12
# Changed: adapted for xpdf 0.90 release          <grdetil@scrc.umanitoba.ca>
# Added:   uses pdfinfo to handle PDF titles      <grdetil@scrc.umanitoba.ca>
# Changed: change dashes to hyphens               <grdetil@scrc.umanitoba.ca>
# 1999/09/09
# Changed: fix to handle empty PDF title right    <grdetil@scrc.umanitoba.ca>
# 1999/12/01
# Changed: rewritten as external converter        <grdetil@scrc.umanitoba.ca>
#          stripped out all parser-related code
# Added:   test to silently ignore wrapped EPS files     < " >
# Added:   test for null device on Win32 env.     <PBISSET@emergency.qld.gov.au>
# 2000/01/12
# Changed: "break" to "last" (no break in Perl)   <wjones@tc.fluke.com>
# 2001/07/12
# Changed: fix "last" handling in dehyphenation   <grdetil@scrc.umanitoba.ca>
# Added:   handle %xx codes in title from URL     <grdetil@scrc.umanitoba.ca>
#########################################

use strict;
use warnings;

#
# set this to your MS Word to text converter
# get it from: http://www.fe.msk.ru/~vitus/catdoc/
#
our $CATDOC = "/usr/local/bin/catdoc";
#
# set this to your WordPerfect to text converter, or /bin/true if none available
# this nabs WP documents with .doc suffix, so catdoc doesn't see them
#
our $CATWP = "/bin/true";
#
# set this to your RTF to text converter, or /bin/true if none available
# this nabs RTF documents with .doc suffix, so catdoc doesn't see them
#
our $CATRTF = "/bin/true";
#
# set this to your PostScript to text converter
# get it from the ghostscript 3.33 (or later) package
#
our $CATPS = "/usr/bin/ps2ascii";
#
# set this to your PDF to text converter, and pdfinfo tool
# get it from the xpdf 0.90 package at http://www.foolabs.com/xpdf/
#
our $CATPDF  = "/usr/bin/pdftotext";
our $PDFINFO = "/usr/bin/pdfinfo";
#our $CATPDF  = "/usr/local/bin/pdftotext";
#our $PDFINFO = "/usr/local/bin/pdfinfo";

#########################################
#
# need some var's
our $ishtml = 0;    # set if converter produces HTML

# make portable to win32 platform or unix
our $null = ($^O eq "MSWin32") ? "nul" : "/dev/null";


#########################################
#
# Helpers

# Escape the three HTML metacharacters in TEXT and return the result.
# An undefined argument yields the empty string.
sub html_escape {
    my ($text) = @_;
    return '' unless defined $text;
    $text =~ s/&/&amp;/g;    # must run first so later entities are not re-escaped
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    return $text;
}

# Quote ARG so it is passed to the shell as a single literal word.
# Unix: single quotes, with embedded single quotes spliced in as '\''.
# Win32: double quotes; embedded double quotes are dropped.
# NOTE(review): the Win32 branch does not neutralise %VAR% expansion -- confirm
# whether that matters for your file names.
sub shell_quote {
    my ($arg) = @_;
    if ($^O eq "MSWin32") {
        $arg =~ s/"//g;
        return qq{"$arg"};
    }
    $arg =~ s/'/'\\''/g;
    return "'$arg'";
}

# Return the first LEN bytes of FILE (fewer if the file is shorter).
# Dies if the file cannot be opened.
sub read_magic {
    my ($file, $len) = @_;
    open(my $fh, '<', $file) or die "Can't open file $file: $!\n";
    binmode $fh;             # signatures are binary; avoid CRLF translation
    my $magic = '';
    read($fh, $magic, $len);
    close $fh;
    return defined $magic ? $magic : '';
}

# Classify a file from its leading bytes (like file(1) does).
# Returns one of 'PostScript', 'EPS', 'PDF', 'WordPerfect', 'RTF', 'Word',
# or undef when no signature matches.  The tests are order-sensitive.
sub detect_type {
    my ($magic) = @_;
    return 'PostScript'  if $magic =~ /%!|^\033%-12345/;      # PS or HP print job
    return 'EPS'         if $magic =~ /\305\320\323\306\036/; # wrapped EPS
    return 'PDF'         if $magic =~ /%PDF-/;                # Acrobat
    return 'WordPerfect' if $magic =~ /WPC/;
    return 'RTF'         if $magic =~ /^\{\\rtf/;
    return 'Word'        if $magic =~ /\320\317\021\340/;
    return;
}

# Ask pdfinfo for the Title of FILE.  Returns the HTML-escaped title with
# whitespace normalised, or '' when there is none or pdfinfo cannot be run.
sub pdf_title {
    my ($file) = @_;
    my $title = '';
    my $cmd = "$PDFINFO " . shell_quote($file) . " 2>$null";
    if (open(my $info, '-|', $cmd)) {
        while (my $line = <$info>) {
            next unless $line =~ s/^Title:\s*//;
            $line =~ s/\s+$//;
            $line =~ s/\s+/ /g;
            $title = html_escape($line);
            last;
        }
        close $info;
    }
    return $title;
}

# Build a fallback title "<TYPE> Document <basename>" from the last path
# component of URL, decoding %xx escapes.  The result is HTML-escaped.
# A missing or empty URL yields just "<TYPE> Document".
sub default_title {
    my ($type, $url) = @_;
    my @parts = split(/\//, defined $url ? $url : '');
    my $name = @parts ? $parts[-1] : '';
    $name =~ s/%([A-F0-9][A-F0-9])/pack("C", hex($1))/gie;
    my $title = $name eq '' ? "$type Document" : "$type Document $name";
    return html_escape($title);
}

# Copy converter text from handle IN to handle OUT as HTML-safe text.
# When DEHYPHENATE is true, a word split by a hyphen at the end of a line is
# rejoined with its continuation on the following line.
sub emit_text {
    my ($in, $out, $dehyphenate) = @_;
    while (my $line = <$in>) {
        if ($dehyphenate) {
            # Check eof *before* reading so the final pair of lines is still
            # joined, and so we never append an undefined read.
            while ($line =~ /[A-Za-z\300-\377]-\s*$/ && !eof($in)) {
                $line .= <$in>;
                $line =~ s/([A-Za-z\300-\377])-\s*\n\s*([A-Za-z\300-\377])/$1$2/s;
            }
        }
        $line =~ s/[\255]/-/g;    # replace dashes with hyphens
        $line =~ s/\f/\n/g;       # replace form feed
        print {$out} html_escape($line);    # HTMLify text
    }
    return;
}

#########################################
#
# Main entry point: FILE, CONTENT-TYPE and URL as passed by htdig.
# Returns the process exit status (0); dies on unrecoverable errors.
sub main {
    my ($file, $content_type, $url) = @_;
    die "Usage: $0 file content-type URL\n"
        unless defined $file && length $file;
    $content_type = '' unless defined $content_type;

    # Read first bytes of file to check for file type (like file(1) does)
    my $magic = read_magic($file, 8);
    if ($magic =~ /^\0\n/) {                # possible MacBinary header
        $magic = read_magic($file, 136);    # let's hope converters can handle them!
    }

    my $type = detect_type($magic);
    die "Can't determine type of file $file; content-type: $content_type; URL: "
        . (defined $url ? $url : '') . "\n"
        unless defined $type;
    return 0 if $type eq 'EPS';             # it's a wrapped EPS - ignore

    my $quoted      = shell_quote($file);
    my $dehyphenate = 0;                    # set if we must dehyphenate text output
    my $want_title  = 0;
    my ($cvtr, $cvtcmd);

    if ($type eq 'PostScript') {
        if ($magic =~ /^\033%-12345/) {     # HP print job: only accept PostScript ones
            $magic = read_magic($file, 256);
            return 0 unless $magic =~ /^\033%-12345X\@PJL.*\n*.*\n*.*ENTER\s*LANGUAGE\s*=\s*POSTSCRIPT.*\n*.*\n*.*\n%!/;
        }
        $cvtr = $CATPS;                     # gs 3.33 leaves _temp_.??? files in .
        # keep quiet even if PS gives errors...
        $cvtcmd = "(cd /tmp; $cvtr; rm -f _temp_.???) < $quoted 2>$null";
        # allow PS interpreter to give error messages...
        # $cvtcmd = "(cd /tmp; $cvtr; rm -f _temp_.???) < $quoted";
        $dehyphenate = 0;                   # ps2ascii already does this
    } elsif ($type eq 'PDF') {
        $cvtr   = $CATPDF;
        $cvtcmd = "$cvtr -raw $quoted -";
        # to handle single-column, strangely laid out PDFs, use coalescing feature...
        # $cvtcmd = "$cvtr $quoted -";
        $dehyphenate = 1;                   # PDFs often have hyphenated lines
        $want_title  = 1;
    } elsif ($type eq 'WordPerfect') {
        $cvtr   = $CATWP;
        $cvtcmd = "$cvtr $quoted";
        $dehyphenate = 0;                   # WP documents not likely hyphenated
    } elsif ($type eq 'RTF') {
        $cvtr   = $CATRTF;
        $cvtcmd = "$cvtr $quoted";
        $dehyphenate = 0;                   # RTF documents not likely hyphenated
    } else {                                # MS Word
        $cvtr   = $CATDOC;
        $cvtcmd = "$cvtr -a -w $quoted";
        $dehyphenate = 0;                   # Word documents not likely hyphenated
    }

    die "$cvtr is absent or unwilling to execute.\n" unless -x $cvtr;

    my $title = $want_title ? pdf_title($file) : '';
    # to use coalescing feature conditionally...
    # if ($title =~ /...Title of Corel DRAW output.../) {
    #     $cvtcmd = "$cvtr $quoted -";
    # }

    #############################################
    #
    # Start output.

    # if running as a converter for "user-defined" output type...
    #print "Content-Type: text/html\n\n";

    if ($ishtml) {
        # converter will give its own HTML output
        # system() returns 0 on success, so compare explicitly.
        system($cvtcmd) == 0
            or die "$cvtr doesn't want to be run from shell.\n";
        return 0;
    }

    # Produce HTML output from converter's text output, so we can add title.
    print "<HTML>\n<head>\n";

    # print out the title, if it's set, and not just a file name, or make one up
    if ($title eq "" || $title =~ /^[A-G]:[^\s]+\.[Pp][Dd][Ff]$/) {
        $title = default_title($type, $url);    # use the file basename in title
    }
    print "<title>$title</title>\n";

    print "</head>\n<body>\n";

    # Open file via selected converter, output its text.
    open(my $cat, '-|', $cvtcmd)
        or die "$cvtr doesn't want to be opened using pipe.\n";
    emit_text($cat, \*STDOUT, $dehyphenate);

    print "</body>\n</HTML>\n";

    close $cat;
    return 0;
}

# Run only when executed as a script, so the helpers can be loaded for testing.
exit main(@ARGV) unless caller;

1;