1#!/usr/bin/perl 2# 3# Copyright (C) 2007 Geoffrey M. Voelker 4# Copyright (c) 2016-2018 Eddie Kohler; see LICENSE. 5# 6# banal -- analyze pdf formatting 7# 8# This program is free software; you can redistribute it and/or modify 9# it under the terms of the GNU General Public License as published by 10# the Free Software Foundation; either version 2 of the License, or 11# (at your option) any later version. 12# 13# This program is distributed in the hope that it will be useful, 14# but WITHOUT ANY WARRANTY; without even the implied warranty of 15# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16# GNU General Public License for more details. 17# 18# You should have received a copy of the GNU General Public License 19# along with this program. If not, see <http://www.gnu.org/licenses/>. 20# 21# Geoffrey M. Voelker (voelker@cs.ucsd.edu) 22# 23 24# todo: 25# -- computer modern roman fonts 26# -- embedded java script, remoteapproach.com 27 28use Data::Dumper; 29use File::Basename; 30use File::Temp; 31use POSIX; 32use List::Util qw(min max); 33my($FILE, $banal_text_fudge); 34 35sub usage { 36 print <<EOF; 37usage: banal [-report | -stats | -judge [specs]] [-zoom=N] files 38 39banal has three modes of operation: 40 41-report print full formatting info for all pages. this mode is 42 the default if no mode is specified: 43 44 % banal paper.pdf 45 46-stats print formatting info condensed into one line with fields 47 separated by tabs; useful for computing summary stats across 48 many papers. 49 50 fields are 'file', 'paper', 'text region', 'margins', 'font', 51 'leading', 'columns', 'pages', 'app'. for example: 52 53 % banal -stats *.pdf | cut -f 5 54 55 extracts font sizes from a set of pdf files. 56 57-judge compare document formatting against a set of formatting 58 specifications: 59 60 -paper=type paper type ('letter' and 'A4' currently supported) 61 -pages=num max number of pages 62 -font=num min font size 63 -leading=num min leading 64 -cols=num max columns 65 -width=inches max text region width 66 -height=inches max text region height 67 -fudge=inches text region fudge factor (helps with latex 68 overflow; default is $banal_text_fudge inches) 69 70 specifications can consist of any and all elements in any 71 combination. for example: 72 73 % banal -judge -paper=letter -pages=14 -font=10 -leading=12 -width=6.5 -height=9 *.pdf 74 75 will check whether a set of pdf files conforms to formatting specs 76 that require 8.5" x 11" paper, max 14 pages, min 10 point font, 77 min 12 point leading, and a max text region of 6.5" x 9". 78 79-format=lines|list 80 81 lines report format violations on multiple lines (default) 82 83 list report format violations on a single line separated by a 84 comma (e.g., for importing into a spreadsheet). 85 86 % banal -judge -format=list [specs] *.pdf 87 88 - - - - - - - - - - - - - - - - - - 89 90-zoom=N passed to pdftohtml. 91 92-no_app do not calculate application 93 94-json JSON output 95 96-version report the version of banal 97 98EOF 99 exit(1); 100} 101 102# version 103$banal_version = 1.2; 104 105# parse args 106local($report, $stats, $judge, $no_app, $json, $version, $debug_pdftohtml, 107 $paper, $pages, $font, $leading, $cols, $width, $height, $fudge, $format, $zoom); 108for (my $i = 0; $i < @ARGV; ) { 109 no strict "refs"; 110 if ($ARGV[$i] =~ /\A--?(report|stats|judge|no[-_]app|json|version|debug[-_]pdftohtml)\z/) { 111 my($name) = $1; 112 $name =~ s/-/_/g; 113 ${$name} = 1; 114 splice @ARGV, $i, 1; 115 } elsif ($ARGV[$i] =~ /\A--?(paper|pages|font|leading|cols|width|height|fudge|format|zoom)=(.*)\z/) { 116 ${$1} = $2; 117 splice @ARGV, $i, 1; 118 } elsif ($ARGV[$i] =~ /\A--?(paper|pages|font|leading|cols|width|height|fudge|format|zoom)\z/ && $i + 1 < @ARGV) { 119 ${$1} = $ARGV[$i + 1]; 120 splice @ARGV, $i, 2; 121 } elsif ($ARGV[$i] =~ /\A-/) { 122 print STDERR "banal: bad option ", $ARGV[$i], "\n"; 123 usage; 124 } else { 125 $i += 1; 126 } 127} 128 129my(@switches); 130push @switches, "-zoom=$zoom" if defined $zoom; 131 132# zoom value 133if ((defined $zoom) && ($zoom !~ /^[1-9]\d*(\.\d*)?$/)) { 134 print STDERR "banal: bad -zoom\n"; 135 usage; 136} 137 138# mapping from pdftohtml units to inches 139#$p2h_per_inch = 72; 140my $p2h_per_inch; 141 142# scale factor from pdftohtml units to points 143#$p2h_to_points = 72 / $p2h_per_inch; 144my $p2h_to_points; 145 146# minimum amount of text on page for it to be interesting 147my $banal_min_density = 8000; 148 149# fudge factor when judging text regions (in inches). 150$banal_text_fudge = 0.05; 151 152# minimum number of pages that have to fail the text region specs. 153# often papers have 1-2 pages where text on a table or figure extends 154# into the margin. when judging an entire paper, we'll let those slide... 155my $banal_judge_min_fail_pages = 3; 156 157# policy to use to estimate leading 158my $banal_leading_policy; 159 160# round margins and text blocks to this number of points 161my $grid = 4; 162 163# pdftohtml executable 164my $pdftohtml; 165if (exists $ENV{"PDFTOHTML"}) { 166 $pdftohtml = $ENV{"PDFTOHTML"}; 167} elsif (exists $ENV{"PHP_PDFTOHTML"}) { 168 $pdftohtml = $ENV{"PHP_PDFTOHTML"}; 169} elsif (defined $pdftohtml_prog) { 170 $pdftohtml = $pdftohtml_prog; 171} else { 172 $pdftohtml = "pdf-to-html"; 173} 174 175#print STDERR "using $pdftohtml...\n"; 176 177# version of pdftohtml program 178my $p2h_version = 0; 179 180# full path of file being analyzed 181my $banal_fullpath = ''; 182# file name of file being analyzed 183my $banal_filename = ''; 184 185my $use_raw_leading; 186my $title = ''; 187 188# return min key in hash 189sub minkey ($) { 190 my ($href) = @_; 191 return (sort { $a <=> $b } keys %$href)[0]; 192} 193 194# return max key in hash 195sub maxkey ($) { 196 my ($href) = @_; 197 return (sort { $a <=> $b } keys %$href)[$#_ - 1]; 198} 199 200# return key of mode of values in hash 201sub modevalkey ($) { 202 my ($href) = @_; 203 my ($mode) = (keys %$href)[0]; 204 map { $mode = $_ if ($href->{$_} > $href->{$mode}) } keys %$href; 205 return $mode; 206} 207 208# return max val in hash 209sub maxval ($) { 210 my ($href) = @_; 211 my ($max) = (keys %$href)[0]; 212 map { $max = $_ if ($href->{$_} > $href->{$max}) } keys %$href; 213 return $href->{$max}; 214} 215 216# return 'a' == 'b' 217sub bb_equal ($$) { 218 my ($a, $b) = @_; 219 return (($a->{top} == $b->{top}) && 220 ($a->{left} == $b->{left}) && 221 ($a->{height} == $b->{height}) && 222 ($a->{width} == $b->{width})); 223} 224 225# merge 'a' into 'b' 226sub bb_merge ($$) { 227 my ($a, $b) = @_; 228 229 $b->{top} = min $a->{top}, $b->{top}; 230 $b->{left} = min $a->{left}, $b->{left}; 231 $b->{height} = max $a->{height}, $b->{height}; 232 $b->{width} = max $a->{width}, $b->{width}; 233} 234 235sub calc_page_body_font ($) { 236 my ($page) = @_; 237 my ($mode) = modevalkey ($page->{pagedata}->{segdata}->{byfont}); 238 $page->{pagedata}->{bodyfont} = $page->{doc}->{fonts}->{$mode}; 239 $page->{pagespec}->{bodyfont} = p2h_font_to_font_size ($page->{pagedata}->{bodyfont}); 240 if ($page->{pagespec}->{bodyfont} == 0) { 241 print STDERR "$banal_filename: Error: Zero font on page $page->{num}, font id $mode\n"; 242 } 243} 244 245sub utf8ascii_undo ($) { 246 my ($str) = @_; 247 248 return $str unless ($str =~ /^\\376\\377(\\\d\d\d.)*$/); 249 250 # string is UTF-8 in ASCII (not binary) 251 # (PDFCreator seems to like to do this, also freepdfconvert) 252 print "$banal_filename: ascii UTF-8: $title\n" if ($debug_docapp); 253 254 $str =~ s/\\376\\377//; 255 $str =~ s/\\000//g; 256 257 print "$banal_filename: unencoded: $str\n" if ($debug_docapp); 258 return $str; 259} 260 261sub utf8bin_undo ($) { 262 my ($str) = @_; 263 264 return $str unless ($str =~ /^\376\377(\000.)*$/); 265 266 # string is UTF-8 in binary 267 print "$banal_filename: bin UTF-8: $str\n" if ($debug_docapp); 268 269 $str =~ s/\376\377//; 270 $str =~ s/\000//g; 271 272 print "$banal_filename: unencoded $str\n" if ($debug_docapp); 273 return $str; 274} 275 276sub utf8revbin_undo ($) { 277 my ($str) = @_; 278 279 # bytes reversed: character then null bytes (ScanSoft on the Mac) 280 281 return $str unless ($str =~ /^\377\376(.\000)*$/); 282 283 # string is UTF-8 in binary 284 print "$banal_filename: rev bin UTF-8: $str\n" if ($debug_docapp); 285 286 $str =~ s/\377\376//; 287 $str =~ s/\000//g; 288 289 print "$banal_filename: unencoded $str\n" if ($debug_docapp); 290 return $str; 291} 292 293sub utf8hex_undo ($) { 294 my ($str) = @_; 295 296 return $str unless ($str =~ /^FEFF(00..)*$/i); 297 298 print "$banal_filename: hex UTF-8: $str\n" if ($debug_docapp); 299 300 $str =~ s/^FEFF//i; 301 $str =~ s/00//g; 302 print "$banal_filename: hex ascii: $str\n" if ($debug_docapp); 303 $str = pack ("H*", $str); 304 305 print "$banal_filename: packed $str\n" if ($debug_docapp); 306 return $str; 307} 308 309# inferring the document application has two steps: 310# 1) extracting the doc metadata 311# 2) mapping metadata info to an application 312# 313# for (1), ideally we could use a module or tool to extract the 314# InfoDict from the end of the pdf file. but there are some cases 315# where we need to peek outside the InfoDict for additional hints, so 316# in the end we still have to scan through the pdf file ourselves. 317# 318# for (2), the world would be a simpler place if applications followed 319# some kind of convention. but given the large combination of apps, 320# pdf converters, and OSes, of course the world is not so simple. so, 321# as usual, it's back to heuristics gathered from samples... 322 323sub calc_doc_app ($) { 324 my ($doc) = @_; 325 my ($fname) = $doc->{fullpath}; 326 327 my ($creator, $producer, $creatortool, $ptex); 328 my ($rdftitle, $pdfproducer); 329 my ($indirect, $quartzpdf, $pdfmachine, $cmrfont, $texfont); 330 331 $creator = $title = $producer = $creatortool = $ptex = ''; 332 $rdftitle = $pdfproducer = ''; 333 $indirect = $quartzpdf = $pdfmachine = $cmrfont = $texfont = 0; 334 335 my ($app, @allapps); 336 $app = ''; 337 @allapps = (); 338 339 if (!open (PDF, $fname)) { 340 print STDERR "$banal_filename: Error: Failed to open $fname for inferring doc app."; 341 $doc->{app} = 'unknown'; 342 return; 343 } 344 345 while (<PDF>) { 346 347 if (m|\/Creator\s*\(([^\)]+)\)|) { 348 $creator = $1; 349 } elsif (m|\/Creator\s*<([^\)]+)>|) { 350 # UTF-8 ascii hex 351 $creator = utf8hex_undo ($1); 352 } elsif (m|\/Creator \d+ \d+ R|) { 353 # Indirection: 354 # << /Producer 313 0 R /Creator 314 0 R ... 355 $indirect = 1; 356 } 357 358 if (m|\/Title\s*\(([^\)]+)\)|) { 359 $title = $1; 360 } elsif (m|\/Title\s*<([^\)]+)>|) { 361 # UTF-8 ascii hex 362 $title = utf8hex_undo ($1); 363 } elsif (m|<dc:title>.+<rdf:li.+>(.+)</rdf:li>.+</dc:title>|) { 364 $rdftitle = $1; 365 } elsif (m|<dc:title>|) { 366 unless (m|</dc:title>|) { 367 while (<PDF>) { 368 last if (m|</dc:title>|); 369 next unless (m|<rdf:li.+>(.+)</rdf:li>|); 370 $rdftitle = $1; 371 } 372 } 373 } 374 375 if (m|\/Producer\s*\(([^\)]+)\)|) { 376 $producer = $1; 377 } elsif (m|\/Producer\s*<([^\)]+)>|) { 378 # UTF-8 ascii hex 379 $producer = utf8hex_undo ($1); 380 } elsif (m|<pdf:Producer>(.+)</pdf:Producer>|) { 381 $pdfproducer = $1; 382 } 383 384 # xap: Adobe Extensible Authoring and Publishing (early name, 5.0) 385 # xmp: Adobe Extensible Metadata Platform (final name) 386 if (m|<x[am]p:CreatorTool>(.+)<\/x[am]p:CreatorTool>|) { 387 $creatortool = $1; 388 } 389 390 if (m|<pdfx:PTEX|) { 391 # <pdfx:PTEX.Fullbanner>This is pdfTeX...</pdfx:PTEX.Fullbanner> 392 $ptex = 1; 393 } 394 395 if (m|\(Mac OS.+Quartz PDFContext\)|) { 396 # (Mac OS X 10.6.2 Quartz PDFContext) [producer indirection] 397 $quartzpdf = 1; 398 } elsif (m|\(TeX\)|) { 399 # (TeX) [creator indirection] 400 $tex = 1; 401 } elsif (m|% created by pdfMachine|) { 402 # tool doesn't bother to create any metadata whatsoever... 403 $pdfmachine = 1; 404 } 405 406 if (!$cmrfont && m|(\/BaseFont\s*\/\w+\+[Cc][Mm][Rr]\d+)|) { 407 # /BaseFont/EGYAWT+CMR8 408 $pdf_tools{'cmr fonts'}++; 409 $cmrfont = 1; 410 } elsif (!$texfont && m|/BaseFont\s*/\w+\+([Cc][Mm]\w\w\d+)|) { 411 $pdf_tools{'tex fonts'}++; 412 $texfont = $1; 413 } 414 415 } 416 417 close (PDF); 418 419 # undo any UTF-8 in ascii (literally "\376\377\000P\000r\000o...") 420 $title = utf8ascii_undo ($title) if ($title); 421 $creator = utf8ascii_undo ($creator) if ($creator); 422 $producer = utf8ascii_undo ($producer) if ($producer); 423 $creatortool = utf8ascii_undo ($creatortool) if ($creatortool); 424 $rdftitle = utf8ascii_undo ($rdftitle) if ($rdftitle); 425 $pdfproducer = utf8ascii_undo ($pdfproducer) if ($pdfproducer); 426 427 # undo any UTF-8 in binary 428 $title = utf8bin_undo ($title) if ($title); 429 $creator = utf8bin_undo ($creator) if ($creator); 430 $producer = utf8bin_undo ($producer) if ($producer); 431 $creatortool = utf8bin_undo ($creatortool) if ($creatortool); 432 $rdftitle = utf8bin_undo ($rdftitle) if ($rdftitle); 433 $pdfproducer = utf8bin_undo ($pdfproducer) if ($pdfproducer); 434 435 # undo any UTF-8 in binary (reversed) 436 $title = utf8revbin_undo ($title) if ($title); 437 $creator = utf8revbin_undo ($creator) if ($creator); 438 $producer = utf8revbin_undo ($producer) if ($producer); 439 $creatortool = utf8revbin_undo ($creatortool) if ($creatortool); 440 $rdftitle = utf8revbin_undo ($rdftitle) if ($rdftitle); 441 $pdfproducer = utf8revbin_undo ($pdfproducer) if ($pdfproducer); 442 443 $title = $rdftitle if (!$title && $rdftitle); 444 445 # Word 446 if ($creator =~ /Microsoft.+Word/) { 447 # Mac OS Quartz PDFContext, doPDF 448 $pdf_tools{'word in creator'}++; 449 $app = 'word'; 450 } elsif ($title =~ /Microsoft Word \-/) { 451 # ps->pdf w/ gs, distiller 452 # often doc name in title after '-' (but not always) 453 $pdf_tools{'gs, distiller'}++; 454 $app = 'word'; 455 } elsif ($title =~ /Proceedings Template \- WORD/i) { 456 $pdf_tools{'template'}++; 457 $app = 'word'; 458 } elsif ($creator =~ /easyPDF/) { 459 # BCL easyPDF 460 $pdf_tools{'easyPDF'}++; 461 $app = 'word'; 462 } elsif ($creator =~ /PDFCreator/) { 463 $pdf_tools{'PDFCreator'}++; 464 $app = 'word'; 465 } elsif ($creator =~ /PDFMaker.+Word/) { 466 $pdf_tools{'PDFMaker'}++; 467 $app = 'word'; 468 } elsif ($creator =~ /Sonic PDF/) { 469 $pdf_tools{'sonic pdf'}++; 470 $app = 'word'; 471 } elsif ($creatortool =~ /Word/) { 472 # Adobe XMP metadata 473 $pdf_tools{'Acrobat PDFMaker'}++; 474 $app = 'word'; 475 } elsif ($producer =~ /freepdfconvert|deskPDF|ReportLab|PDF reDirect/) { 476 $pdf_tools{'misc pdf tools'}++; 477 $app = 'word'; 478# } elsif ($creator =~ /\000M\000i\000c\000r\000o\000s\000o\000f\000t.+\000W\000o\000r\000d/i) { 479 # UTF-8 binary 480# $pdf_tools{'Word (UTF-8)'}++; 481# $app = 'word'; 482 } elsif ($pdfmachine) { 483 $pdf_tools{'pdfmachine'}++; 484 $app = 'word'; 485 } elsif ($title =~ /\.docx?$/i) { 486 # Amyuni puts the filename in the title 487 $pdf_tools{'doc(x) extension'}++; 488 $app = 'word'; 489 } 490 491 if ($app) { 492 push (@allapps, $app); 493 $app = ''; 494 495 # never seen this happen, but let's sanity check... 496 if ($cmrfont) { 497 print STDERR "$banal_filename: Warning: CMR font in Word doc?\n"; 498 $pdf_tools{'** cmrfont in word doc'}++; 499 } 500 } 501 502 # TeX 503 if ($creator =~ /TeX/) { 504 $pdf_tools{'tex in creator'}++; 505 $app = 'tex'; 506 } elsif ($creatortool =~ /(MiK)?TeX/) { 507 $pdf_tools{'(mik)tex in creatortool'}++; 508 $app = 'tex'; 509 } elsif ($creator =~ /dvips/) { 510 $pdf_tools{'dvips in creator'}++; 511 $app = 'tex'; 512 } elsif ($producer =~ /dvips/) { 513 $pdf_tools{'dvips in producer'}++; 514 $app = 'tex'; 515 } elsif ($producer =~ /PrimoPDF/ && $title =~ /\.dvi$/) { 516 $pdf_tools{'primopdf'}++; 517 $app = 'tex'; 518 } elsif (($creator =~ /gnuplot/) && ($producer =~ /Ghostscript|Distiller/)) { 519 # highly likely a tex document 520 $pdf_tools{'gnuplot + gs|dist'}++; 521 $app = 'tex'; 522 } elsif ($producer =~ /Ghostscript|PDFContext|pstopdf|AntennaHouse PDF/ && !$creator && !$title) { 523 # just a producer tag, no other InfoDict metadata... 524 # have yet to see a Word doc that didn't like InfoDict metadata 525 $pdf_tools{'only producer'}++; 526 $app = 'tex'; 527 } elsif ($indirect && $quartzpdf && $tex) { 528 if ($creator || $producer) { 529 print STDERR "$banal_filename: Warning: direct and indirect InfoDict entries\n"; 530 } 531 $pdf_tools{'tex quartzpdf'}++; 532 $app = 'tex'; 533 } elsif ($creatortool =~ /gnuplot/ && !$creator && !$producer && !$title) { 534 $pdf_tools{'only gnuplot'}++; 535 $app = 'tex'; 536 } elsif ($ptex) { 537 $pdf_tools{'pdftex in pdfx'}++; 538 $app = 'tex'; 539 } elsif ($producer =~ /Ghostscript/ && $title =~ /\.pdf$/) { 540 $pdf_tools{'gs ps to pdf'}++; 541 $app = 'tex'; 542 } elsif ($cmrfont) { 543 $pdf_tools{'cmrfont'}++; 544 $app = 'tex'; 545 } 546 547 if ($app) { 548 push (@allapps, $app); 549 $app = ''; 550 } 551 552 # OpenOffice 553 if ($producer =~ /OpenOffice/) { 554 $pdf_tools{'open office'}++; 555 push (@allapps, 'openoffice'); 556 } 557 558 if ($creator =~ /Interleaf/) { 559 $pdf_tools{'interleaf + distiller'}++; 560 push (@allapps, 'interleaf'); 561 } 562 563 # FrameMaker (!) 564 if ($creator =~ /FrameMaker/) { 565 $pdf_tools{'frame'}++; 566 push (@allapps, 'framemaker'); 567 } 568 569 # sanity check that we haven't matched more than one application, 570 # or whether we didn't match anything... 571 if (scalar (@allapps) > 1) { 572 print STDERR "$banal_filename: Error: multiple apps inferred: @allapps\n"; 573 $app = 'unknown'; 574 } elsif (scalar (@allapps) < 1) { 575 print STDERR "$banal_filename: Warning: failed to infer document app, using 'unknown'\n"; 576# print STDERR "$banal_filename: Creator: $creator\n" if ($creator); 577# print STDERR "$banal_filename: Title: $title\n" if ($title); 578# print STDERR "$banal_filename: Producer: $producer\n" if ($producer); 579# print STDERR "$banal_filename: CreatorTool: $creatortool\n" if ($creatortool); 580# print STDERR "$banal_filename: RDFTitle: $rdftitle\n" if ($rdftitle); 581# print STDERR "$banal_filename: PDFProducer: $pdfproducer\n" if ($pdfproducer); 582# print STDERR "$banal_filename: cmrfont\n" if ($cmrfont); 583# print STDERR "$banal_filename: texfont $texfont\n" if ($texfont); 584 $app = 'unknown'; 585 } else { 586 $app = $allapps[0]; 587 } 588 589# $pdf_tools{$app}++; 590 $doc->{app} = $app; 591 592 if ($debug_docapp) { 593 print STDERR "$banal_filename: Creator: $creator\n" if ($creator); 594 print STDERR "$banal_filename: Title: $title\n" if ($title); 595 print STDERR "$banal_filename: Producer: $producer\n" if ($producer); 596 print STDERR "$banal_filename: CreatorTool: $creatortool\n" if ($creatortool); 597 print STDERR "$banal_filename: RDFTitle: $rdftitle\n" if ($rdftitle); 598 print STDERR "$banal_filename: PDFProducer: $pdfproducer\n" if ($pdfproducer); 599 print STDERR "$banal_filename: cmrfont\n" if ($cmrfont); 600 print STDERR "$banal_filename: texfont $texfont\n" if ($texfont); 601 foreach $t (keys %pdf_tools) { 602 print "$t: $pdf_tools{$t}\n"; 603 } 604 } 605 606 return; 607} 608 609sub calc_page_leading ($) { 610 my ($page) = @_; 611# my ($mode) = modevalkey ($page->{pagedata}->{segdata}->{leads}); 612 my ($mode, $segs); 613 614 $segs = $page->{pagedata}->{segdata_byfont}->{$page->{pagedata}->{bodyfont}->{id}}; 615 $mode = modevalkey ($segs->{leads}); 616 617 $count = $segs->{leads}->{$mode} + 618 $segs->{leads}->{$mode - 1} + 619 $segs->{leads}->{$mode + 1}; 620 if ($count <= 0) { 621 $page->{pagespec}->{lead} = 0; 622 return; 623 } 624 625 if ($banal_leading_policy eq 'mode') { 626 print "using leading policy 'mode'\n" if ($debug_leading); 627 $lead = $mode * $p2h_to_points; 628 $lead *= 10; 629 $lead = int ($lead + 0.5); 630 $lead /= 10; 631 print "leading: $lead\n" if ($debug_leading); 632 $page->{pagespec}->{lead} = $lead; 633 return; 634 } 635 636 if ($debug_leading) { 637 # leading histogram 638 $ll = $segs->{leads}; 639 foreach $k (sort { $a <=> $b } keys %$ll) { 640 my ($l) = int (($k * $p2h_to_points * 10) + 0.5); 641 $l /= 10; 642 print "$l ($segs->{leads}->{$k}) "; 643 } 644 print "\n"; 645 } 646 647 $wsum = $mode * ($segs->{leads}->{$mode} / $count); 648 $wsum += ($mode - 1) * ($segs->{leads}->{$mode - 1} / $count); 649 $wsum += ($mode + 1) * ($segs->{leads}->{$mode + 1} / $count); 650 $lead = $wsum * $p2h_to_points; 651 $lead *= 10; 652 $lead = int ($lead + 0.5); 653 $lead /= 10; 654 655 $page->{pagespec}->{lead} = $lead; 656 657# print Dumper ($segs->{leads}); 658} 659 660sub calc_page_columns ($) { 661 my ($page) = @_; 662 my ($segs, $maxw, $colw, $ncols); 663 664 # use estimated width of text region as base 665 my $pagew = $page->{pagespec}->{textbb}->{width}; 666 my $leftmargin = $page->{pagespec}->{textbb}->{left}; 667 my $paperw = $page->{pagespec}->{paperbb}->{width}; 668 my $expected_pagew = $paperw - 2 * max(min($leftmargin, $paperw - $pagew - $leftmargin), 0); 669 $pagew = $expected_pagew if $pagew < 0.9 * $expected_pagew; 670 671 # use the maximum width segment in the body font to estimate 672 # column width 673 $segs = $page->{pagedata}->{segdata_byfont}->{$page->{pagedata}->{bodyfont}->{id}}; 674# $maxw = maxkey ($segs->{widths}); 675 $modew = modevalkey ($segs->{widths}); 676 $colw = $modew / $p2h_per_inch; 677 678 if ($colw >= ($pagew / 2.0)) { 679 $ncols = 1; 680 } elsif (($colw < ($pagew / 2.0)) && ($colw >= ($pagew / 3.0))) { 681 $ncols = 2; 682 } elsif (($colw < ($pagew / 3.0)) && ($colw >= ($pagew / 4.0))) { 683 $ncols = 3; 684 } elsif (($colw < ($pagew / 4.0)) && ($colw >= ($pagew / 5.0))) { 685 $ncols = 4; 686 } elsif (($colw < ($pagew / 5.0)) && ($colw >= ($pagew / 6.0))) { 687 $ncols = 5; 688 } elsif (($colw < ($pagew / 6.0)) && ($colw >= ($pagew / 7.0))) { 689 $ncols = 6; 690 } elsif (($colw < ($pagew / 7.0)) && ($colw >= ($pagew / 8.0))) { 691 $ncols = 7; 692 } elsif ($page->{pagespec}->{density} < $banal_min_density) { 693 $ncols = 1; 694 } else { 695 my ($num) = $page->{num}; 696# print Dumper ($segs->{widths}); 697 printf STDERR "$banal_filename: Error (page $num): Unknown number of columns: width of typical text segment %.2fin, page %.2fin.\n", $colw, $pagew; 698 $ncols = 1; 699 } 700 701 $page->{pagedata}->{ncols} = $ncols; 702 $page->{pagespec}->{ncols} = $ncols; 703} 704 705sub calc_page_text_region ($$) { 706 my ($page, $segdata) = @_; 707 my ($minw, $maxw, $minh, $maxh); 708 my ($segs_minw, $segs_maxw); 709 710 $segs_minw = $segdata->{lefts}; 711 $segs_maxw = $segdata->{rights}; 712 713 # find the minimum left position among segments (must be 714 # multiple segments with that position to skip outliers) 715 $minw = 8 * $p2h_per_inch; 716 717 foreach $s (keys %$segs_minw) { 718 $minw = $s if (($s < $minw) && ($segs_minw->{$s} > 3)); 719 } 720 721 # all consistency bets are off with low density pages 722 $minw = minkey ($segs_minw) if ($minw > 4 * $p2h_per_inch); 723 724 # find the maximum right position among segments (must be 725 # multiple segments with that position to skip outliers) 726 $maxw = 0; 727 foreach $s (keys %$segs_maxw) { 728 $maxw = $s if (($s > $maxw) && ($segs_maxw->{$s} >= 2)); 729 } 730 731# print "tmpw $tmpw maxw $maxw\n"; 732# if ($maxw < 600) { 733# print Dumper ($segs_maxw); 734# } 735 736 # unjustified text may not have multiple segments with the same 737 # max right position...fall back to just using the max right position 738 $maxw = maxkey ($segs_maxw) if ($maxw < $minw); 739 $maxw = $minw + minkey ($segdata->{widths}) if (!defined $maxw); 740 $maxw = $minw if ($maxw < $minw); 741 742 $minh = minkey ($segdata->{tops}); 743 $maxh = maxkey ($segdata->{bots}); 744 745 $page->{pagedata}->{textbb} = { 746 top => $minh, 747 left => $minw, 748 width => ($maxw - $minw), 749 height => ($maxh - $minh), 750 }; 751 752# print "$minw $maxw\n"; 753# print Dumper ($page->{pagedata}->{textbb}); 754 755 $page->{pagespec}->{textbb} = { 756 top => $minh / $p2h_per_inch, 757 left => $minw / $p2h_per_inch, 758 width => ($maxw - $minw) / $p2h_per_inch, 759 height => ($maxh - $minh) / $p2h_per_inch, 760 }; 761 762 return 1; 763} 764 765sub calc_page_density ($) { 766 my ($page) = @_; 767 my ($bfont, $density); 768 769 $bfont = $page->{pagedata}->{bodyfont}->{id}; 770 $density = maxval ($page->{pagedata}->{segdata_byfont}->{$bfont}->{byfont}); 771 $page->{pagespec}->{density} = $density; 772} 773 774sub calc_doc_body_font ($) { 775 my ($doc) = @_; 776 my ($fonts) = {}; 777 778 for $i (1..$doc->{npages}) { 779 $page = $doc->{pages}->{$i}; 780 $fonts->{$page->{pagespec}->{bodyfont}}++; 781 } 782 783 $doc->{pagespec}->{bodyfont} = modevalkey ($fonts); 784} 785 786sub calc_doc_leading ($) { 787 my ($doc) = @_; 788 my ($leads) = {}; 789 my ($lmode, $page); 790 791 for $i (1..$doc->{npages}) { 792 $page = $doc->{pages}->{$i}; 793 $leads->{$page->{pagespec}->{lead}}++; 794 } 795 $lmode = modevalkey ($leads); 796 797# $use_raw_leading = 1; 798 if (!defined $use_raw_leading) { 799# print "mode: $lmode\n"; 800# print "pages w mode: $leads->{$lmode}\n"; 801 if ($leads->{$lmode} >= $doc->{npages} / 2) { 802 for $i (1..$doc->{npages}) { 803 $page = $doc->{pages}->{$i}; 804 next if ($page->{pagespec}->{lead} == $lmode); 805 806# print "abs diff: ", $lmode - $page->{pagespec}->{lead}, "\n"; 807 if (abs ($lmode - $page->{pagespec}->{lead}) < 0.2) { 808# print "setting to ", $lmode, "\n"; 809 $page->{pagespec}->{lead} = $lmode; 810 } 811 } 812 } 813 } 814 815 if ($debug_leading) { 816 817 print "entire doc\n"; 818 819 for $i (1..$doc->{npages}) { 820 $page = $doc->{pages}->{$i}; 821 $segs = $page->{pagedata}->{segdata_byfont}->{$page->{pagedata}->{bodyfont}->{id}}; 822 $leads = $segs->{leads}; 823 foreach $k (keys %$leads) { 824 $doc_leads{$k} += $segs->{leads}->{$k}; 825 } 826 } 827 828 foreach $k (sort { $a <=> $b } keys %doc_leads) { 829 my ($l) = int (($k * $p2h_to_points * 10) + 0.5); 830 $l /= 10; 831 print "$l ($doc_leads{$k}) "; 832 } 833 print "\n"; 834 835 { 836 $mode = modevalkey (\%doc_leads); 837 print "mvk $mode\n"; 838 $count = $doc_leads{$mode} + 839 $doc_leads{$mode - 1} + 840 $doc_leads{$mode + 1}; 841 842 $wsum = $mode * ($doc_leads{$mode} / $count); 843 print "b: ", $wsum, "\n"; 844 $wsum += ($mode - 1) * ($doc_leads{$mode - 1} / $count); 845 $wsum += ($mode + 1) * ($doc_leads{$mode + 1} / $count); 846 $lead = $wsum * $p2h_to_points; 847 print "c: ", $lead, "\n"; 848 $lead *= 10; 849 print "d: ", $lead, "\n"; 850 $lead = int ($lead + 0.5); 851 $lead /= 10; 852 print "lead: $lead\n"; 853 } 854 } 855 856 $doc->{pagespec}->{lead} = $lmode; 857} 858 859sub calc_doc_text_region ($) { 860 my ($doc) = @_; 861 my ($page, $maxw, $maxh, $minl, $mint, $rmarg, $bmarg); 862 863 $page = $doc->{pages}->{1}; 864 $maxw = $page->{pagespec}->{textbb}->{width}; 865 $maxh = $page->{pagespec}->{textbb}->{height}; 866 $minl = $page->{pagespec}->{textbb}->{left}; 867 $mint = $page->{pagespec}->{textbb}->{top}; 868 869 for $i (2..$doc->{npages}) { 870 next if ($page->{density} < $banal_min_density); 871 872 $page = $doc->{pages}->{$i}; 873 $maxw = max $maxw, $page->{pagespec}->{textbb}->{width}; 874 $maxh = max $maxh, $page->{pagespec}->{textbb}->{height}; 875 $minl = min $minl, $page->{pagespec}->{textbb}->{left}; 876 $mint = min $mint, $page->{pagespec}->{textbb}->{top}; 877 } 878 $doc->{textbb}->{width} = $maxw; 879 $doc->{textbb}->{height} = $maxh; 880 $doc->{textbb}->{left} = $minl; 881 $doc->{textbb}->{top} = $mint; 882 883 $rmarg = $doc->{pagespec}->{paperbb}->{width} - ($doc->{textbb}->{width} + $doc->{textbb}->{left}); 884 $bmarg = $doc->{pagespec}->{paperbb}->{height} - ($doc->{textbb}->{height} + $doc->{textbb}->{top}); 885 if ($rmarg < 0) { 886 print STDERR "r MARGIN\n"; 887 } 888 if ($bmarg < 0) { 889 print STDERR "b MARGIN\n"; 890 } 891 $doc->{textbb}->{rmarg} = $rmarg; 892 $doc->{textbb}->{bmarg} = $bmarg; 893} 894 895sub calc_doc_page_types ($) { 896 my ($doc) = @_; 897 my ($page, $font, $type); 898 899 $font = $doc->{pagespec}->{bodyfont}; 900 901 for $i (1..$doc->{npages}) { 902 $page = $doc->{pages}->{$i}; 903 $type = 'body'; 904 905 if ($i == 1 && $page->{pagespec}->{density} < 3000) { 906 $type = 'cover'; 907 } elsif ($page->{pagespec}->{bodyfont} < $font) { 908 if (($doc->{npages} - $i) < ($doc->{npages} / 3)) { 909 $type = 'bib'; 910 } 911 } elsif ($page->{pagespec}->{density} < $banal_min_density) { 912 if ($i == $doc->{npages}) { 913 $type = 'bib'; 914 } else { 915 $type = 'figure'; 916 } 917 } 918 919 $page->{pagespec}->{type} = $type; 920 } 921} 922 923sub calc_doc_columns ($) { 924 my ($doc) = @_; 925 my ($page); 926 my ($cols) = {}; 927 928 for $i (1..$doc->{npages}) { 929 $page = $doc->{pages}->{$i}; 930 $cols->{$page->{pagespec}->{ncols}}++; 931 } 932 933 # number of columns on greatest number of pages 934 $doc->{ncols} = modevalkey ($cols); 935} 936 937sub p2h_font_to_font_size ($) { 938 my ($font) = @_; 939 my ($pt) = ($font->{size} + 3) / $zoom; 940 941 if ($font->{family} eq 'Times' 942 || $font->{family} eq 'Helvetica' 943 || $font->{family} eq 'Courier' 944 || $font->{family} eq 'Symbol') { 945 } else { 946 print STDERR "$banal_filename: Error: Unknown font family.\n"; 947# print Dumper ($font); 948 } 949 950 return $pt; 951} 952 953sub p2h_font_bug ($) { 954 my ($doc) = @_; 955 956 return 1 if ($doc->{pagespec}->{bodyfont} <= 0); 957 return 0; 958} 959 960sub p2h_serious_font_bug ($) { 961 my ($doc) = @_; 962 963 return 0 if (!p2h_font_bug ($doc)); 964 return 1 if ($doc->{textbb}->{width} == 0 || 965 $doc->{textbb}->{height} == 0); 966 return 0; 967} 968 969my %json_escapes = ( 970 "\n" => "\\n", 971 "\r" => "\\r", 972 "\f" => "\\f", 973 "\t" => "\\t", 974 "\"" => "\\\"", 975 "\\" => "\\\\", 976 "/" => "\\/" 977); 978 979sub json_quote ($) { 980 my($x) = $_[0]; 981 $x =~ s{[\n\r\f\t\"\\/]}{$json_escapes{$&}}ge; 982 "\"$x\""; 983} 984 985sub report_json ($) { 986 my ($doc) = @_; 987 988 printf "{\n \"at\": %d,\n", time; 989 printf " \"args\": %s,\n", json_quote(join(" ", @switches)) if @switches; 990 991 my $dx = {"pw" => {}, "ph" => {}, "tw" => {}, "th" => {}, "mt" => {}, "ml" => {}}; 992 my $px = {}, $nummargin = 10000; 993 for my $i (1 .. $doc->{npages}) { 994 my $page = $doc->{pages}->{$i}; 995 my($pbb, $tbb) = ($page->{pagespec}->{paperbb}, $page->{pagespec}->{textbb}); 996 my($tl) = POSIX::floor($tbb->{left} * 72 / $grid) * $grid; 997 my($tt) = POSIX::floor($tbb->{top} * 72 / $grid) * $grid; 998 my($tr) = POSIX::ceil(($tbb->{left} + $tbb->{width}) * 72 / $grid) * $grid; 999 my($tb) = POSIX::ceil(($tbb->{top} + $tbb->{height}) * 72 / $grid) * $grid; 1000 my($pd) = {"pw" => (sprintf "%.0f", $pbb->{width} * 72 / $grid) * $grid, 1001 "ph" => (sprintf "%.0f", $pbb->{height} * 72 / $grid) * $grid, 1002 "mt" => $tt, 1003 "ml" => $tl, 1004 "tw" => $tr - $tl, 1005 "th" => $tb - $tt}; 1006 $px->{$i} = $pd; 1007 my($k, $v); 1008 while (($k, $v) = each %$pd) { 1009 $dx->{$k}->{$v} += 1; 1010 } 1011 my($pnummargin) = POSIX::floor($pd->{ph} - $page->{pagedata}->{lowest_number} * $p2h_to_points); 1012 $nummargin = min($pnummargin, $nummargin) if $pnummargin < $pd->{ph} - $tb; 1013 } 1014 my($pw, $ph) = (modevalkey($dx->{pw}), modevalkey($dx->{ph})); 1015 my($tw, $th) = (modevalkey($dx->{tw}), modevalkey($dx->{th})); 1016 my($mt, $ml) = (modevalkey($dx->{mt}), modevalkey($dx->{ml})); 1017 1018 my ($doc_ps) = sprintf "\"papersize\": [%.0f,%.0f]", $ph, $pw; 1019 my ($doc_margin) = sprintf "\"margin\": [%.0f,%.0f,%.0f,%.0f]", $mt, $pw - ($ml + $tw), $ph - ($mt + $th), $ml; 1020 my ($doc_bfs); 1021 if (p2h_font_bug($doc)) { 1022 $doc_bfs = "\"bodyfontsize\": null"; 1023 } else { 1024 $doc_bfs = sprintf "\"bodyfontsize\": %g", $doc->{pagespec}->{bodyfont}; 1025 } 1026 my ($doc_l) = sprintf "\"leading\": %g", $doc->{pagespec}->{lead}; 1027 my ($doc_c) = sprintf "\"columns\": %d", $doc->{pages}->{1}->{pagespec}->{ncols}; 1028 print " $doc_ps,\n $doc_margin,\n $doc_bfs,\n $doc_l,\n $doc_c,\n"; 1029 printf " \"nummargin\": %.0f,\n", $nummargin if $nummargin < 10000; 1030 print " \"pages\": ["; 1031 $sep = "\n"; 1032 1033 my %pages; 1034 for my $i (1 .. $doc->{npages}) { 1035 my $page = $doc->{pages}->{$i}; 1036 my @val = (); 1037 1038 if ($page->{num} =~ /\A\d+\z/ && $page->{num} ne $i) { 1039 push @val, sprintf "\"pageno\": %d", $page->{num}; 1040 } elsif ($page->{num} ne $i) { 1041 push @val, sprintf "\"pageno\": %s", json_quote($page->{num}); 1042 } 1043 1044 my($pd) = $px->{$i}; 1045 my($page_ps) = sprintf "\"papersize\": [%.0f,%.0f]", $pd->{ph}, $pd->{pw}; 1046 push @val, $page_ps if $page_ps ne $doc_ps; 1047 my($page_margin) = sprintf "\"margin\": [%.0f,%.0f,%.0f,%.0f]", $pd->{mt}, $pd->{pw} - ($pd->{ml} + $pd->{tw}), $pd->{ph} - ($pd->{mt} + $pd->{th}), $pd->{ml}; 1048 push @val, $page_margin if $page_margin ne $doc_margin; 1049 my($page_bfs) = sprintf "\"bodyfontsize\": %g", $page->{pagespec}->{bodyfont}; 1050 push @val, $page_bfs if $page_bfs ne $doc_bfs; 1051 my($page_l) = sprintf "\"leading\": %g", $page->{pagespec}->{lead}; 1052 push @val, $page_l if $page_l ne $doc_l; 1053 my($page_c) = sprintf "\"columns\": %d", $page->{pagespec}->{ncols}; 1054 push @val, $page_c if $page_c ne $doc_c; 1055 push @val, sprintf "\"d\": %d", $page->{pagespec}->{density}; 1056 push @val, sprintf "\"pagetype\": %s", json_quote($page->{pagespec}->{type}) 1057 if $page->{pagespec}->{type} ne "body"; 1058 1059 print $sep, " {", join(", ", @val), "}"; 1060 $sep = ",\n"; 1061 } 1062 print "\n ]\n}\n"; 1063} 1064 1065sub report_verbose ($) { 1066 my ($doc) = @_; 1067 my ($page) = $doc->{pages}->{1}; 1068 1069 print $file, "\n"; 1070 if (p2h_font_bug ($doc)) { 1071 print STDERR $file, "\n"; 1072 print STDERR "$banal_filename: Error: pdftohtml encountered font problems...some info likely bogus.\n"; 1073 } 1074 printf "Paper size: %.2fin x %.2fin\n", $doc->{pagespec}->{paperbb}->{width}, $doc->{pagespec}->{paperbb}->{height}; 1075 printf "Text region: %.2fin x %.2fin\n", $doc->{textbb}->{width}, 1076 $doc->{textbb}->{height}; 1077 printf "Margins: %.2fin x %.2fin x %.2fin x %.2fin (l/r/t/b)\n", 1078 $doc->{textbb}->{left}, 1079 $doc->{textbb}->{rmarg}, 1080 $doc->{textbb}->{top}, 1081 $doc->{textbb}->{bmarg}; 1082 printf "Body font size: %.2fpt", $doc->{pagespec}->{bodyfont}; 1083 if (p2h_font_bug ($doc)) { 1084 print " (bogus)"; 1085 } 1086 print "\n"; 1087 printf "Leading: %.1fpt\n", $doc->{pagespec}->{lead}; 1088 print "Columns: ", $page->{pagespec}->{ncols}, "\n"; 1089 print "Pages: ", $doc->{npages}, "\n"; 1090 print "App: ", $doc->{app}, "\n" if $doc->{app} ne ""; 1091 1092 print "\n"; 1093 for $i (1..$doc->{npages}) { 1094 $page = $doc->{pages}->{$i}; 1095 1096 print "Page $page->{num}:\n"; 1097 printf (" text region: %.2fin x %.2fin\n", $page->{pagespec}->{textbb}->{width}, $page->{pagespec}->{textbb}->{height}); 1098 1099 $left_i = $page->{pagespec}->{textbb}->{left}; 1100 $right_i = $page->{pagespec}->{paperbb}->{width} - 1101 ($left_i + $page->{pagespec}->{textbb}->{width}); 1102 $top_i = $page->{pagespec}->{textbb}->{top}; 1103 $bot_i = $page->{pagespec}->{paperbb}->{height} - 1104 ($top_i + $page->{pagespec}->{textbb}->{height}); 1105 printf " margins: %.2fin x %.2fin x %.2fin x %.2fin (l/r/t/b)\n", 1106 $left_i, $right_i, $top_i, $bot_i; 1107 1108 printf " body font: %gpt (id %d)\n", $page->{pagespec}->{bodyfont}, 1109 $page->{pagedata}->{bodyfont}->{id}; 1110 printf " leading: %gpt\n", $page->{pagespec}->{lead}; 1111 printf " columns: %d\n", $page->{pagespec}->{ncols}; 1112 print " type: ", $page->{pagespec}->{type}, "\n"; 1113 1114 $density = $page->{pagespec}->{density}; 1115 printf " density: %d\n", $density; 1116 } 1117} 1118 1119sub report_stats ($) { 1120 my ($doc) = @_; 1121 my ($page) = $doc->{pages}->{1}; 1122 1123 if (p2h_serious_font_bug ($doc)) { 1124 print STDERR "$banal_filename: Error: pdftohtml encountered font problems...skipping.\n"; 1125 return; 1126 } 1127 1128 if (p2h_font_bug ($doc)) { 1129 print STDERR "$banal_filename: Warning: pdftohtml encountered font problems...some info likely bogus.\n"; 1130 } 1131 1132 printf "$file\t%.2fx%.2f\t%.2fx%.2f\t%.2fx%.2fx%.2fx%.2f\t%d\t%.1f\t%d\t%d\t%s\n", 1133 # page width x height 1134 $doc->{pagespec}->{paperbb}->{width}, 1135 $doc->{pagespec}->{paperbb}->{height}, 1136 # text region width x height 1137 $doc->{textbb}->{width}, 1138 $doc->{textbb}->{height}, 1139 # margins left x right x top x bottom 1140 $doc->{textbb}->{left}, 1141 $doc->{textbb}->{rmarg}, 1142 $doc->{textbb}->{top}, 1143 $doc->{textbb}->{bmarg}, 1144 # body font 1145 $doc->{pagespec}->{bodyfont}, 1146 # leading 1147 $doc->{pagespec}->{lead}, 1148 # columns 1149 $doc->{pagespec}->{ncols}, 1150 # pages 1151 $doc->{npages}, 1152 # app 1153 $doc->{app}; 1154} 1155 1156sub judge_paper_size ($$) { 1157 my ($doc, $spec) = @_; 1158 my ($msg) = ''; 1159 my ($w, $h); 1160 1161 $w = $doc->{pagespec}->{paperbb}->{width}; 1162 $h = $doc->{pagespec}->{paperbb}->{height}; 1163 if ($spec->{paper} eq 'letter') { 1164 $paperw = 8.5; 1165 $paperh = 11; 1166 } elsif ($spec->{paper} eq 'A4') { 1167 $paperw = 8.26; 1168 $paperh = 11.69; 1169 } 1170 1171 unless (((($paperw - $banal_text_fudge) < $w) && 1172 (($paperw + $banal_text_fudge) > $w)) && 1173 ((($paperh - $banal_text_fudge) < $h) && 1174 (($paperh + $banal_text_fudge) > $h))) { 1175 $msg = sprintf ("Paper size: %.2f x %.2f is not $spec->{paper} size\n", 1176 $w, $h); 1177 } 1178 1179 return $msg; 1180} 1181 1182sub judge_page_count ($$) { 1183 my ($doc, $spec) = @_; 1184 my ($msg) = ''; 1185 1186 if ($doc->{npages} > $spec->{pages}) { 1187 $msg = sprintf ("Pages: too many pages %d (max %d)\n", 1188 $doc->{npages}, $spec->{pages}); 1189 } elsif ($spec->{min_pages} && 1190 ($doc->{npages} < $spec->{min_pages})) { 1191 $msg = sprintf ("Pages: too few pages %d (min %d)\n", 1192 $doc->{npages}, $spec->{min_pages}); 1193 } 1194 1195 return $msg; 1196} 1197 1198sub judge_body_font ($$) { 1199 my ($doc, $spec) = @_; 1200 my ($msg) = ''; 1201 my ($i, $font); 1202 1203 if (p2h_font_bug ($doc)) { 1204 $msg .= "Font: Cannot judge, no font info derived from pdf\n"; 1205 return $msg; 1206 } 1207 1208 if ($doc->{pagespec}->{bodyfont} < $spec->{font}) { 1209 $msg .= sprintf ("Font: body font too small %dpt (min %dpt)\n", 1210 $doc->{pagespec}->{bodyfont}, $spec->{font}); 1211 } 1212 return $msg; 1213} 1214 1215sub app_msg ($) { 1216 my ($doc) = @_; 1217 return ($doc->{app} ne "" ? " using " . $doc->{app} : ""); 1218} 1219 1220sub judge_leading ($$) { 1221 my ($doc, $spec) = @_; 1222 my ($msg) = ''; 1223 my ($lead); 1224 1225 $lead = $doc->{pagespec}->{lead}; 1226 if (($spec->{lead} - 0.1) > $lead) { 1227 $msg .= sprintf ("Leading: too small %.1fpt (min %.1fpt)%s\n", 1228 $lead, $spec->{lead}, app_msg($doc)); 1229 } 1230} 1231 1232sub judge_columns ($$) { 1233 my ($doc, $spec) = @_; 1234 my ($msg) = ''; 1235 my ($i, $page); 1236 1237 # should add a 'strict' option 1238 if ($doc->{ncols} > $spec->{cols}) { 1239 $msg = sprintf ("Columns: found %d columns, expecting %d\n", 1240 $doc->{ncols}, $spec->{cols}); 1241 } 1242 1243 return $msg if (1); 1244 1245 # skip last page 1246 for $i (1..($doc->{npages} - 1)) { 1247 $page = $doc->{pages}->{$i}; 1248 1249 next if ($page->{pagespec}->{density} < $banal_min_density); 1250 1251 next unless ($spec->{cols} != $page->{pagespec}->{ncols}); 1252 1253 $msg = sprintf ("Columns: found %d columns, expecting %d\n", 1254 $page->{pagespec}->{ncols}, $spec->{cols}); 1255 last; 1256 } 1257 1258 return $msg; 1259} 1260 1261sub judge_text_region ($$) { 1262 my ($doc, $spec) = @_; 1263 my ($wmsg, $hmsg) = ('', ''); 1264 my ($i, $page); 1265 my ($width, $height, $width_fail, $height_fail); 1266 1267 $width_fail = 0; 1268 for $i (1..$doc->{npages}) { 1269 $page = $doc->{pages}->{$i}; 1270 1271 # ignore pages without much text 1272 next if ($page->{pagespec}->{density} < $banal_min_density); 1273 1274 $width = $page->{pagespec}->{textbb}->{width}; 1275 next unless ($spec->{width} && 1276 ($width > ($spec->{width} + $spec->{fudge}))); 1277 $width_fail++; 1278 1279 $wmsg = sprintf ("Width: text too wide %.2fin (max %.2fin)\n", 1280 $width, $spec->{width}); 1281 } 1282 1283 # if a small number of pages fail the width spec, it is likely 1284 # due to tables or figures extending into the margin. 1285 # only check on reasonably long docs. 1286 if ($doc->{npages} > (($banal_judge_min_fail_pages - 1) * 2)) { 1287 if ($width_fail < $banal_judge_min_fail_pages) { 1288 $wmsg = ''; 1289 } 1290 } 1291 1292 1293 $height_fail = 0; 1294 for $i (1..$doc->{npages}) { 1295 $page = $doc->{pages}->{$i}; 1296 1297 next if ($page->{pagespec}->{density} < $banal_min_density); 1298 1299 $height = $page->{pagespec}->{textbb}->{height}; 1300 next unless ($spec->{height} && 1301 ($height > ($spec->{height} + $spec->{fudge}))); 1302 $height_fail++; 1303 1304 $hmsg = sprintf ("Height: text too high %.2fin (max %.2fin)\n", 1305 $height, $spec->{height}); 1306 } 1307 1308 # if a small number of pages fail the height spec, it is likely 1309 # due to tables or figures extending into the margin. 1310 # only check on reasonably long docs. 1311 if ($doc->{npages} > (($banal_judge_min_fail_pages - 1) * 2)) { 1312 if ($height_fail < $banal_judge_min_fail_pages) { 1313 $hmsg = ''; 1314 } 1315 } 1316 1317# $hmsg .= sprintf ("Fail: width $width_fail height $height_fail\n"); 1318 1319 return $wmsg . $hmsg; 1320} 1321 1322sub pass_judgement ($$) { 1323 my ($doc, $spec) = @_; 1324 my ($page); 1325 my ($msg) = ''; 1326 my ($err); 1327 1328 if (p2h_serious_font_bug ($doc)) { 1329 print STDERR "$banal_filename: Error: pdftohtml encountered font problems...skipping.\n"; 1330 return; 1331 } 1332 1333 $msg .= judge_paper_size ($doc, $spec) if ($spec->{paper}); 1334 $msg .= judge_page_count ($doc, $spec) if ($spec->{pages}); 1335 $msg .= judge_body_font ($doc, $spec) if ($spec->{font}); 1336 $msg .= judge_leading ($doc, $spec) if ($spec->{lead}); 1337 $msg .= judge_columns ($doc, $spec) if ($spec->{cols}); 1338 $msg .= judge_text_region ($doc, $spec) if ($spec->{width} || $spec->{height}); 1339 1340 return if (!$msg); 1341 1342 if ($format eq 'list') { 1343 chop $msg; # remove trailing newline 1344 $msg =~ s/\n/,/g; # convert newlines to commas 1345 print basename ($file), ",$msg\n"; 1346 } else { 1347 $msg =~ s/^(.)/ $1/mg; # indent 1348 print $file, ":\n"; 1349 print $msg; 1350 } 1351} 1352 1353sub parse_p2h_fonts ($$) { 1354 my ($line, $page) = @_; 1355 my (%fonts, $font, $fontid); 1356 1357 while (1) { 1358# print "p2h_font: $line"; 1359 return $line if ($line =~ /<\/page>/); 1360 1361 last unless ($line =~ /<fontspec id=\"(\d+)\" size=\"([-]*\d+)\" family=\"([A-Za-z0-9]+)\" color=\"(\#[a-fA-F0-9]+)\"\/>/); 1362 1363 $font = { id => $1, size => $2, family => $3, color => $4 }; 1364 $fontid = "$3//$2//$4"; 1365 if (exists $fonts{$fontid}) { 1366 $font->{id} = $fonts{$fontid}; 1367 } else { 1368 $fonts{$fontid} = $1; 1369 } 1370 $page->{doc}->{fonts}{$1} = $font; 1371 1372 $line = <$FILE>; 1373 } 1374 1375 return $line; 1376} 1377 1378sub update_segdata ($$$) { 1379 my ($page, $segdata, $seg) = @_; 1380 my ($top, $left, $width, $height, $font, $lead) = @$seg; 1381 my ($bottom) = $top + $height; 1382 my ($right) = $left + $width; 1383 my ($pagew) = $page->{pagedata}->{pagebb}->{width}; 1384 1385 $segdata->{widths}{$width}++ if ($width > $p2h_per_inch); 1386 $segdata->{lefts}{$left}++ if ($left < ($pagew / 3)); 1387 $segdata->{rights}{$right}++ if ($right > ($pagew / 3)); 1388 $segdata->{tops}{$top}++ if ($width > $p2h_per_inch); 1389 $segdata->{bots}{$bottom}++ if ($width > $p2h_per_inch); 1390# $segdata->{leads}{$lead}++ if ($lead > 0 && $width > $p2h_per_inch); 1391 $segdata->{leads}{$lead}++ if ($lead > 0); 1392 1393 # count number of segments in a given font size, weighted by the 1394 # width of the segment. the font with the greatest weight 1395 # will be the body font. 1396 1397 $segdata->{byfont}{$font} += $width; 1398} 1399 1400sub check_p2h_error ($) { 1401 my ($line) = @_; 1402 1403 # check for pdftohtml error strings embedded in output 1404 return 1 if ($line =~ /^stroke seems to be a pattern/); 1405 1406 return 0; 1407} 1408 1409sub parse_p2h_text ($$) { 1410 my ($line, $page) = @_; 1411 my ($top, $bottom, $left, $right, $width, $height, $font); 1412 my ($text, $lead, $prevheight); 1413 1414 $segs_all = {}; 1415 $segs_byfont = {}; 1416 1417 $prevheight = 0; 1418 $lowest_number = 0; 1419 1420 while (1) { 1421# next if (check_p2h_error ($line)); 1422 1423 unless ($line =~ /<text top=\"(-?\d+)\" left=\"(-?\d+)\" width=\"(-?\d+)\" height=\"(-?\d+)\" font=\"(-?\d+)\"/) { 1424 # if we didn't match a <text>, then it should be an end of page or <image> 1425 if ($line =~ /<image/) { 1426 $line = <$FILE>; 1427 next; 1428 } 1429 unless ($line =~ /<\/page>/) { 1430 if ($debug_parse) { 1431 print STDERR "$banal_filename: Curious, expecting a </page> but found:\n"; 1432 print STDERR $line; 1433 } 1434 } 1435 last; 1436 } 1437 1438 $height = $1; 1439 if ($prevheight < $height) { 1440 $lead = $height - $prevheight; 1441 } else { 1442 $lead = -1; 1443 } 1444 $prevheight = $height; 1445 1446 @seginfo = ($1, $2, $3, $4, $5, $lead); 1447 if (($font = $page->{doc}->{fonts}{$5})) { 1448 $seginfo[4] = $font->{id}; 1449 } 1450 1451 # sanity check the data somewhat...text from embedded figures 1452 # can produce surprising values 1453 if ($1 < 0 || $2 < 0 || 1454 ($1 > $page->{pagedata}->{pagebb}->{height}) || 1455 ($2 > $page->{pagedata}->{pagebb}->{width})) { 1456 $line = <$FILE>; 1457 next; 1458 } 1459 1460 $nsegs++; 1461 1462 $segs_byfont->{$seginfo[4]} = {} 1463 unless (defined $segs_byfont->{$seginfo[4]}); 1464 $byfont = $segs_byfont->{$seginfo[4]}; 1465 update_segdata ($page, $byfont, \@seginfo); 1466 $segs_byfont{$seginfo[4]} = $byfont; 1467 update_segdata ($page, $segs_all, \@seginfo); 1468 1469 # page number detection 1470 if ($line =~ /<text[^>]*>[- ,.\/]*[0-9][- ,.\/0-9]*<\/text>/) { 1471 my($bottom) = $seginfo[0] + $seginfo[3]; 1472 $lowest_number = max $lowest_number, $bottom; 1473 } 1474 1475 # embedded newlines will split <text>...</text> across multiple lines 1476 if ($line !~ /<\/text>/) { 1477 while ($line = <$FILE>) { 1478 print STDERR "$banal_filename: skipping: $line" if ($debug_parse); 1479 last if ($line =~/<\/text>/); 1480 } 1481 } 1482 1483 $line = <$FILE>; 1484 } 1485 1486 1487 $page->{pagedata}->{nsegs} = $nsegs; 1488 $page->{pagedata}->{segdata} = $segs_all; 1489 $page->{pagedata}->{segdata_byfont} = $segs_byfont; 1490 $page->{pagedata}->{lowest_number} = $lowest_number; 1491 1492 calc_page_body_font ($page); 1493 calc_page_leading ($page); 1494 calc_page_density ($page); 1495 calc_page_text_region ($page, $segs_all); 1496 calc_page_columns ($page); 1497} 1498 1499sub parse_p2h_page ($) { 1500 my ($doc) = @_; 1501 1502 # assume we've just read the header 1503 $line = <$FILE>; 1504 1505 # skip any error strings embedded between pages 1506 while (check_p2h_error ($line)) { 1507 print STDERR "$banal_filename: skipping p2h error string: $line" if ($debug_parse); 1508 $line = <$FILE>; 1509 } 1510 1511 if ($line !~ /<page/ && $line =~ /<outline>/) { 1512 my($nout) = 0; 1513 while (1) { 1514 ++$nout if $line =~ /<outline>/; 1515 --$nout if $line =~ /<\/outline>/; 1516 last if $nout == 0; 1517 $line = <$FILE>; 1518 } 1519 $line = <$FILE> if $line =~ /<\/outline>\s*$/; 1520 } 1521 1522 unless ($line =~ /<page number=\"(\d+)\" position=\"([A-Za-z0-9]+\") top=\"(\d+)\" left=\"(\d+)\" height=\"(\d+)\" width=\"(\d+)\"/) { 1523 return '' if ($line =~ /<\/pdf2xml/); 1524 print STDERR "$banal_filename: Error: \"<page ...\" node expected for page ", $doc->{npages} + 1, "\n"; 1525 chomp $line; 1526 print STDERR "-> '$line'\n"; 1527 return ''; 1528 } 1529 1530 # initialize page data structures 1531 $pagebb = { 1532 top => $3, 1533 left => $4, 1534 height => $5, 1535 width => $6, 1536 }; 1537 1538 $paperbb = { 1539 top => $3 / $p2h_per_inch, 1540 left => $4 / $p2h_per_inch, 1541 height => $5 / $p2h_per_inch, 1542 width => $6 / $p2h_per_inch, 1543 }; 1544 1545 $page = { 1546 doc => $doc, 1547 num => $1, 1548 pagedata => { 1549 pagebb => $pagebb, 1550 }, 1551 pagespec => { 1552 paperbb => $paperbb, 1553 }, 1554 }; 1555 1556 # check for optional fontspecs at start of page 1557 $line = <$FILE>; 1558 if ($line =~ /<fontspec/) { 1559 $line = parse_p2h_fonts ($line, $page); 1560 } elsif ($debug_parse) { 1561 print STDERR "$banal_filename: Curious, no fontspec on page, found:\n"; 1562 print STDERR "$line"; 1563 } 1564 1565 1566 # process text segments 1567 if ($line =~ /<(?:text|image)/) { 1568 parse_p2h_text ($line, $page); 1569 } elsif ($debug_parse) { 1570 print STDERR "$banal_filename: Curious, empty page $page->{num}, found:\n"; 1571 print STDERR "$line"; 1572 } 1573 1574 return $page; 1575} 1576 1577sub parse_p2h_header ($) { 1578 my ($doc) = @_; 1579 1580 while (<$FILE>) { 1581 return 1 if (/<pdf2xml/); 1582 } 1583 return 0; 1584} 1585 1586sub merge_page ($$) { 1587 my ($doc, $page) = @_; 1588 1589 $doc->{npages}++; 1590 $doc->{pages}->{$page->{num}} = $page; 1591 1592 # initialize doc spec with first page spec 1593 if ($page->{num} == 1) { 1594 $doc->{pagespec}->{paperbb} = $page->{pagespec}->{paperbb}; 1595 $doc->{pagespec}->{textbb} = $page->{pagespec}->{textbb}; 1596 $doc->{pagespec}->{bodyfont} = $page->{pagespec}->{bodyfont}; 1597 $doc->{pagespec}->{ncols} = $page->{pagespec}->{ncols}; 1598 return; 1599 } 1600} 1601 1602sub banal_get_spec () { 1603 my ($s) = {}; 1604 1605 return $s unless (defined $judge); 1606 1607 if (defined $paper) { 1608 if ($paper ne 'letter' && $paper ne 'A4') { 1609 die ("$banal_filename: Error: Unknown paper type '$paper'.\n"); 1610 } 1611 $s->{paper} = $paper; 1612 } 1613 $s->{pages} = $pages if (defined $pages); 1614 $s->{font} = $font if (defined $font); 1615 $s->{lead} = $leading if (defined $leading); 1616 $s->{cols} = $cols if (defined $cols); 1617 if (defined $width) { 1618 $s->{width} = $width; 1619 $s->{fudge} = $banal_text_fudge; 1620 } 1621 if (defined $height) { 1622 $s->{height} = $height; 1623 $s->{fudge} = $banal_text_fudge; 1624 } 1625 if (defined $fudge) { 1626 $s->{fudge} = $fudge; 1627 } 1628 return $s; 1629} 1630 1631sub banal_report_spec ($) { 1632 my ($spec) = @_; 1633 1634 print "Judging: "; 1635 print "$spec->{paper}, " if ($spec->{paper}); 1636 print "$spec->{width}in x $spec->{height}in (~$spec->{fudge}), " if ($spec->{width} || $spec->{height}); 1637 print "$spec->{font}pt font, " if ($spec->{font}); 1638 print "$spec->{lead}pt leading, " if ($spec->{lead}); 1639 print "$spec->{cols} cols, " if ($spec->{cols}); 1640 print "$spec->{pages} pages" if ($spec->{pages}); 1641 print "\n"; 1642 print "- - - - - - - - - - - - - - - - - - \n"; 1643} 1644 1645sub banal_file ($$) { 1646 my ($file, $spec) = @_; 1647 1648 # initialize doc data structure 1649 $doc = { 1650 width => 0, 1651 height => 0, 1652 npages => 0, 1653 ncols => 0, 1654 fonts => {}, 1655 pages => {}, 1656 textbb => {}, 1657 app => '', 1658 fullpath => '', 1659 filename => '', 1660 }; 1661 1662 $doc->{fullpath} = $file; 1663 $banal_fullpath = $file; 1664 $doc->{filename} = basename ($file); 1665 $banal_filename = basename ($file); 1666 1667 if (!parse_p2h_header ($doc)) { 1668 print STDERR "$banal_filename: Error: No pdftohtml output...corrupted pdf file?\n"; 1669 return; 1670 } 1671 1672 calc_doc_app ($doc) if !$no_app; 1673 1674 while ($page = parse_p2h_page ($doc)) { 1675 merge_page ($doc, $page); 1676 } 1677 1678 calc_doc_body_font ($doc); 1679 calc_doc_leading ($doc); 1680 calc_doc_text_region ($doc); 1681 calc_doc_page_types ($doc); 1682 calc_doc_columns ($doc); 1683 1684 if (defined $judge) { 1685 pass_judgement ($doc, $spec); 1686 } elsif (defined $stats) { 1687 report_stats ($doc); 1688 } elsif (defined $json) { 1689 report_json ($doc); 1690 } else { 1691 report_verbose ($doc); 1692 } 1693} 1694 1695sub shell_quote ($) { 1696 my($s) = @_; 1697 $s =~ s/\'/\'\"\'\"\'/g; 1698 return "'$s'"; 1699} 1700 1701sub banal_open_input ($) { 1702 my ($fname) = @_; 1703 my ($base, $ext, $cmd, $oname); 1704 1705 if ($fname =~ /(.+)\.(.+)/) { 1706 ($base, $ext) = ($1, $2); 1707 } else { 1708 print STDERR "$fname: Error: Unable to determine file type from extension.\n"; 1709 return 0; 1710 } 1711 1712 # 2>&1 1713 if ($ext =~ /^pdf$/i) { 1714 ($FILE, $oname) = File::Temp::tempfile("banalXXXXX", UNLINK => 1, SUFFIX => ".xml", TMPDIR => 1); 1715 $zoomarg = "-zoom $zoom"; 1716 $cmd = "$pdftohtml -enc UTF-8 -xml -i $zoomarg " . shell_quote($fname) . " " . shell_quote($oname) . " 2>&1"; 1717 print STDERR "$cmd\n" if ($debug_pdftohtml); 1718 1719 my($ignore_output) = `$cmd`; 1720 unless (-s $FILE) { 1721 print STDERR "$fname: Error: Failed to open file.\n"; 1722 return 0; 1723 } 1724 } elsif ($ext =~ /^xml$/i) { 1725 unless (open ($FILE, "$fname")) { 1726 print STDERR "$fname: Error: Failed to open file.\n"; 1727 return 0; 1728 } 1729 } else { 1730 print STDERR "$fname: Error: Failed to open file.\n"; 1731 return 0; 1732 } 1733 binmode ($FILE, ":utf8"); 1734 1735 return 1; 1736} 1737 1738sub banal_config_p2h ($) { 1739 my ($fname) = @_; 1740 $fname = basename($fname); 1741 my ($major, $minor, $poppler); 1742 1743 if (!defined($zoom)) { 1744 unless (open(P2H, "$pdftohtml -v 2>&1 |")) { 1745 print STDERR "$fname: Error: Failed to run $pdftohtml.\n"; 1746 while (defined($_ = <P2H>)) { 1747 print STDERR; 1748 } 1749 return 0; 1750 } 1751 while (defined($_ = <P2H>)) { 1752 $poppler = 1 if /Poppler/; 1753 next unless (/pdftohtml version (\d+\.\d+)([a-z]*)/); 1754 $p2h_version = "$1$2"; 1755 $major = $1; 1756 $minor = $2; 1757 } 1758 close (P2H); 1759 1760 if (($major >= 0.40) && $minor && (($minor cmp "c") >= 0)) { 1761 # configure for versions 0.40c and above 1762 $zoom = 10; 1763 } else { 1764 $zoom = 3; 1765 } 1766 } 1767 1768 if ($leading_policy) { 1769 $banal_leading_policy = $leading_policy; 1770 } else { 1771 # use a default policy according to the zoom level we can use 1772 # at low zoom, interpolate 1773 if ($zoom >= 10) { 1774 $banal_leading_policy = 'mode'; 1775 } else { 1776 $banal_leading_policy = 'interpolate'; 1777 } 1778 } 1779 1780 print "leading policy: $banal_leading_policy\n" if ($debug_leading); 1781 1782 1783 $p2h_per_inch = 72 * $zoom; 1784 $p2h_to_points = 72 / $p2h_per_inch; 1785 1786 return 1; 1787} 1788 1789sub banal_version () { 1790 print "Banal version $banal_version.\n"; 1791 return 0; 1792} 1793 1794sub main () { 1795 my ($spec); 1796 1797 return banal_version () if (defined $version); 1798 1799 usage if ($#ARGV < 0); 1800 1801 $spec = banal_get_spec (); 1802 banal_report_spec ($spec) if (defined $judge); 1803 1804 if (!banal_config_p2h ($ARGV[0])) { 1805 return 1; 1806 } 1807 1808 foreach $file (@ARGV) { 1809 # open input file into FILE 1810 next unless (banal_open_input ($file)); 1811 banal_file ($file, $spec); 1812 close $FILE; 1813 } 1814 return 0; 1815} 1816 1817exit (main ()); 1818 1819# 1820# 2011-1-25 1821# (utf8revbin_undo): new function; a tool incorrectly reverses multibytes 1822# 1823# 2011-1-19 1824# (check_p2h_error): skip 'stroke seems...' pdftohtml output that can 1825# appear between page output. 1826# 1827# 2011-1-18 1828# (parse_p2h_page, parse_p2h_text, parse_p2h_fonts): handle <text> 1829# segments that span multiple lines from embedded newlines accurately. 1830# handle optional <fontspecc> commands more gracefully. 1831# (debug_parse): new flag. 1832# 1833# 2011-1-17 1834# (update_segdata): fix reporting negative leadings. 1835# 1836# 2011-1-11 1837# (utf8ascii_undo, utf8bin_undo, utf8hex_undo, calc_doc_app): new functions 1838# for inferring the application used to create the document. 1839# (report_verbose, report_stats, judge_leading): report doc application. 1840# (debug_docapp): new flag. 1841# 1842# 2011-1-07 1843# uniformly print filename in error messages. 1844# 1845# 2010-12-31 1846# 1847# (judge_format): new flag, option 'list' reports all violations 1848# on a single line in CSV format. default option 'lines' is original 1849# behavior with one per line. 1850# 1851