1#!/usr/bin/perl
2#
3# Copyright (C) 2007 Geoffrey M. Voelker
4# Copyright (c) 2016-2018 Eddie Kohler; see LICENSE.
5#
6# banal -- analyze pdf formatting
7#
8# This program is free software; you can redistribute it and/or modify
9# it under the terms of the GNU General Public License as published by
10# the Free Software Foundation; either version 2 of the License, or
11# (at your option) any later version.
12#
13# This program is distributed in the hope that it will be useful,
14# but WITHOUT ANY WARRANTY; without even the implied warranty of
15# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16# GNU General Public License for more details.
17#
18# You should have received a copy of the GNU General Public License
19# along with this program.  If not, see <http://www.gnu.org/licenses/>.
20#
21# Geoffrey M. Voelker (voelker@cs.ucsd.edu)
22#
23
24# todo:
25# -- computer modern roman fonts
26# -- embedded java script, remoteapproach.com
27
28use Data::Dumper;
29use File::Basename;
30use File::Temp;
31use POSIX;
32use List::Util qw(min max);
33my($FILE, $banal_text_fudge);
34
35sub usage {
36    print <<EOF;
37usage: banal [-report | -stats | -judge [specs]] [-zoom=N] files
38
39banal has three modes of operation:
40
41-report  print full formatting info for all pages.  this mode is
42         the default if no mode is specified:
43
44         % banal paper.pdf
45
46-stats   print formatting info condensed into one line with fields
47         separated by tabs; useful for computing summary stats across
48         many papers.
49
50         fields are 'file', 'paper', 'text region', 'margins', 'font',
51         'leading', 'columns', 'pages', 'app'.  for example:
52
53         % banal -stats *.pdf | cut -f 5
54
55         extracts font sizes from a set of pdf files.
56
57-judge   compare document formatting against a set of formatting
58         specifications:
59
60         -paper=type     paper type ('letter' and 'A4' currently supported)
61         -pages=num      max number of pages
62         -font=num       min font size
63         -leading=num    min leading
64         -cols=num       max columns
65         -width=inches   max text region width
66         -height=inches  max text region height
67         -fudge=inches   text region fudge factor (helps with latex
68                         overflow; default is $banal_text_fudge inches)
69
70         specifications can consist of any and all elements in any
71         combination.  for example:
72
73         % banal -judge -paper=letter -pages=14 -font=10 -leading=12 -width=6.5 -height=9 *.pdf
74
75         will check whether a set of pdf files conforms to formatting specs
76         that require 8.5" x 11" paper, max 14 pages, min 10 point font,
77         min 12 point leading, and a max text region of 6.5" x 9".
78
79-format=lines|list
80
81         lines   report format violations on multiple lines (default)
82
83         list    report format violations on a single line separated by a
84                 comma (e.g., for importing into a spreadsheet).
85
86         % banal -judge -format=list [specs] *.pdf
87
88  -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -
89
90-zoom=N   passed to pdftohtml.
91
92-no_app   do not calculate application
93
94-json     JSON output
95
96-version  report the version of banal
97
98EOF
99      exit(1);
100}
101
102# version
103$banal_version = 1.2;
104
105# parse args
106local($report, $stats, $judge, $no_app, $json, $version, $debug_pdftohtml,
107      $paper, $pages, $font, $leading, $cols, $width, $height, $fudge, $format, $zoom);
108for (my $i = 0; $i < @ARGV; ) {
109    no strict "refs";
110    if ($ARGV[$i] =~ /\A--?(report|stats|judge|no[-_]app|json|version|debug[-_]pdftohtml)\z/) {
111        my($name) = $1;
112        $name =~ s/-/_/g;
113        ${$name} = 1;
114        splice @ARGV, $i, 1;
115    } elsif ($ARGV[$i] =~ /\A--?(paper|pages|font|leading|cols|width|height|fudge|format|zoom)=(.*)\z/) {
116        ${$1} = $2;
117        splice @ARGV, $i, 1;
118    } elsif ($ARGV[$i] =~ /\A--?(paper|pages|font|leading|cols|width|height|fudge|format|zoom)\z/ && $i + 1 < @ARGV) {
119        ${$1} = $ARGV[$i + 1];
120        splice @ARGV, $i, 2;
121    } elsif ($ARGV[$i] =~ /\A-/) {
122        print STDERR "banal: bad option ", $ARGV[$i], "\n";
123        usage;
124    } else {
125        $i += 1;
126    }
127}
128
129my(@switches);
130push @switches, "-zoom=$zoom" if defined $zoom;
131
132# zoom value
133if ((defined $zoom) && ($zoom !~ /^[1-9]\d*(\.\d*)?$/)) {
134    print STDERR "banal: bad -zoom\n";
135    usage;
136}
137
138# mapping from pdftohtml units to inches
139#$p2h_per_inch = 72;
140my $p2h_per_inch;
141
142# scale factor from pdftohtml units to points
143#$p2h_to_points = 72 / $p2h_per_inch;
144my $p2h_to_points;
145
146# minimum amount of text on page for it to be interesting
147my $banal_min_density = 8000;
148
149# fudge factor when judging text regions (in inches).
150$banal_text_fudge = 0.05;
151
152# minimum number of pages that have to fail the text region specs.
153# often papers have 1-2 pages where text on a table or figure extends
154# into the margin.  when judging an entire paper, we'll let those slide...
155my $banal_judge_min_fail_pages = 3;
156
157# policy to use to estimate leading
158my $banal_leading_policy;
159
160# round margins and text blocks to this number of points
161my $grid = 4;
162
163# pdftohtml executable
164my $pdftohtml;
165if (exists $ENV{"PDFTOHTML"}) {
166    $pdftohtml = $ENV{"PDFTOHTML"};
167} elsif (exists $ENV{"PHP_PDFTOHTML"}) {
168    $pdftohtml = $ENV{"PHP_PDFTOHTML"};
169} elsif (defined $pdftohtml_prog) {
170    $pdftohtml = $pdftohtml_prog;
171} else {
172    $pdftohtml = "pdf-to-html";
173}
174
175#print STDERR "using $pdftohtml...\n";
176
177# version of pdftohtml program
178my $p2h_version = 0;
179
180# full path of file being analyzed
181my $banal_fullpath = '';
182# file name of file being analyzed
183my $banal_filename = '';
184
185my $use_raw_leading;
186my $title = '';
187
188# return min key in hash
189sub minkey ($) {
190    my ($href) = @_;
191    return (sort { $a <=> $b } keys %$href)[0];
192}
193
194# return max key in hash
195sub maxkey ($) {
196    my ($href) = @_;
197    return (sort { $a <=> $b } keys %$href)[$#_ - 1];
198}
199
200# return key of mode of values in hash
201sub modevalkey ($) {
202    my ($href) = @_;
203    my ($mode) = (keys %$href)[0];
204    map { $mode = $_ if ($href->{$_} > $href->{$mode}) } keys %$href;
205    return $mode;
206}
207
208# return max val in hash
209sub maxval ($) {
210    my ($href) = @_;
211    my ($max) = (keys %$href)[0];
212    map { $max = $_ if ($href->{$_} > $href->{$max}) } keys %$href;
213    return $href->{$max};
214}
215
216# return 'a' == 'b'
217sub bb_equal ($$) {
218    my ($a, $b) = @_;
219    return (($a->{top} == $b->{top}) &&
220            ($a->{left} == $b->{left}) &&
221            ($a->{height} == $b->{height}) &&
222            ($a->{width} == $b->{width}));
223}
224
225# merge 'a' into 'b'
226sub bb_merge ($$) {
227    my ($a, $b) = @_;
228
229    $b->{top} = min $a->{top}, $b->{top};
230    $b->{left} = min $a->{left}, $b->{left};
231    $b->{height} = max $a->{height}, $b->{height};
232    $b->{width} = max $a->{width}, $b->{width};
233}
234
235sub calc_page_body_font ($) {
236    my ($page) = @_;
237    my ($mode) = modevalkey ($page->{pagedata}->{segdata}->{byfont});
238    $page->{pagedata}->{bodyfont} = $page->{doc}->{fonts}->{$mode};
239    $page->{pagespec}->{bodyfont} = p2h_font_to_font_size ($page->{pagedata}->{bodyfont});
240    if ($page->{pagespec}->{bodyfont} == 0) {
241        print STDERR "$banal_filename: Error: Zero font on page $page->{num}, font id $mode\n";
242    }
243}
244
245sub utf8ascii_undo ($) {
246    my ($str) = @_;
247
248    return $str unless ($str =~ /^\\376\\377(\\\d\d\d.)*$/);
249
250    # string is UTF-8 in ASCII (not binary)
251    #   (PDFCreator seems to like to do this, also freepdfconvert)
252    print "$banal_filename: ascii UTF-8: $title\n" if ($debug_docapp);
253
254    $str =~ s/\\376\\377//;
255    $str =~ s/\\000//g;
256
257    print "$banal_filename: unencoded: $str\n" if ($debug_docapp);
258    return $str;
259}
260
261sub utf8bin_undo ($) {
262    my ($str) = @_;
263
264    return $str unless ($str =~ /^\376\377(\000.)*$/);
265
266    # string is UTF-8 in binary
267    print "$banal_filename: bin UTF-8: $str\n" if ($debug_docapp);
268
269    $str =~ s/\376\377//;
270    $str =~ s/\000//g;
271
272    print "$banal_filename: unencoded $str\n" if ($debug_docapp);
273    return $str;
274}
275
276sub utf8revbin_undo ($) {
277    my ($str) = @_;
278
279    # bytes reversed: character then null bytes (ScanSoft on the Mac)
280
281    return $str unless ($str =~ /^\377\376(.\000)*$/);
282
283    # string is UTF-8 in binary
284    print "$banal_filename: rev bin UTF-8: $str\n" if ($debug_docapp);
285
286    $str =~ s/\377\376//;
287    $str =~ s/\000//g;
288
289    print "$banal_filename: unencoded $str\n" if ($debug_docapp);
290    return $str;
291}
292
293sub utf8hex_undo ($) {
294    my ($str) = @_;
295
296    return $str unless ($str =~ /^FEFF(00..)*$/i);
297
298    print "$banal_filename: hex UTF-8: $str\n" if ($debug_docapp);
299
300    $str =~ s/^FEFF//i;
301    $str =~ s/00//g;
302    print "$banal_filename: hex ascii: $str\n" if ($debug_docapp);
303    $str = pack ("H*", $str);
304
305    print "$banal_filename: packed $str\n" if ($debug_docapp);
306    return $str;
307}
308
309# inferring the document application has two steps:
310#   1) extracting the doc metadata
311#   2) mapping metadata info to an application
312#
313# for (1), ideally we could use a module or tool to extract the
314# InfoDict from the end of the pdf file.  but there are some cases
315# where we need to peek outside the InfoDict for additional hints, so
316# in the end we still have to scan through the pdf file ourselves.
317#
318# for (2), the world would be a simpler place if applications followed
319# some kind of convention.  but given the large combination of apps,
320# pdf converters, and OSes, of course the world is not so simple.  so,
321# as usual, it's back to heuristics gathered from samples...
322
323sub calc_doc_app ($) {
324    my ($doc) = @_;
325    my ($fname) = $doc->{fullpath};
326
327    my ($creator, $producer, $creatortool, $ptex);
328    my ($rdftitle, $pdfproducer);
329    my ($indirect, $quartzpdf, $pdfmachine, $cmrfont, $texfont);
330
331    $creator = $title = $producer = $creatortool = $ptex = '';
332    $rdftitle = $pdfproducer =  '';
333    $indirect = $quartzpdf = $pdfmachine = $cmrfont = $texfont = 0;
334
335    my ($app, @allapps);
336    $app = '';
337    @allapps = ();
338
339    if (!open (PDF, $fname)) {
340        print STDERR "$banal_filename: Error: Failed to open $fname for inferring doc app.";
341        $doc->{app} = 'unknown';
342        return;
343    }
344
345    while (<PDF>) {
346
347        if (m|\/Creator\s*\(([^\)]+)\)|) {
348            $creator = $1;
349        } elsif (m|\/Creator\s*<([^\)]+)>|) {
350            # UTF-8 ascii hex
351            $creator = utf8hex_undo ($1);
352        } elsif (m|\/Creator \d+ \d+ R|) {
353            # Indirection:
354            # << /Producer 313 0 R /Creator 314 0 R ...
355            $indirect = 1;
356        }
357
358        if (m|\/Title\s*\(([^\)]+)\)|) {
359            $title = $1;
360        } elsif (m|\/Title\s*<([^\)]+)>|) {
361            # UTF-8 ascii hex
362            $title = utf8hex_undo ($1);
363        } elsif (m|<dc:title>.+<rdf:li.+>(.+)</rdf:li>.+</dc:title>|) {
364            $rdftitle = $1;
365        } elsif (m|<dc:title>|) {
366            unless (m|</dc:title>|) {
367                while (<PDF>) {
368                    last if (m|</dc:title>|);
369                    next unless (m|<rdf:li.+>(.+)</rdf:li>|);
370                    $rdftitle = $1;
371                }
372            }
373        }
374
375        if (m|\/Producer\s*\(([^\)]+)\)|) {
376            $producer = $1;
377        } elsif (m|\/Producer\s*<([^\)]+)>|) {
378            # UTF-8 ascii hex
379            $producer = utf8hex_undo ($1);
380        } elsif (m|<pdf:Producer>(.+)</pdf:Producer>|) {
381            $pdfproducer = $1;
382        }
383
384        # xap: Adobe Extensible Authoring and Publishing (early name, 5.0)
385        # xmp: Adobe Extensible Metadata Platform (final name)
386        if (m|<x[am]p:CreatorTool>(.+)<\/x[am]p:CreatorTool>|) {
387            $creatortool = $1;
388        }
389
390        if (m|<pdfx:PTEX|) {
391            # <pdfx:PTEX.Fullbanner>This is pdfTeX...</pdfx:PTEX.Fullbanner>
392            $ptex = 1;
393        }
394
395        if (m|\(Mac OS.+Quartz PDFContext\)|) {
396            # (Mac OS X 10.6.2 Quartz PDFContext) [producer indirection]
397            $quartzpdf = 1;
398        } elsif (m|\(TeX\)|) {
399            # (TeX) [creator indirection]
400            $tex = 1;
401        } elsif (m|% created by pdfMachine|) {
402            # tool doesn't bother to create any metadata whatsoever...
403            $pdfmachine = 1;
404        }
405
406        if (!$cmrfont && m|(\/BaseFont\s*\/\w+\+[Cc][Mm][Rr]\d+)|) {
407            # /BaseFont/EGYAWT+CMR8
408            $pdf_tools{'cmr fonts'}++;
409            $cmrfont = 1;
410        } elsif (!$texfont && m|/BaseFont\s*/\w+\+([Cc][Mm]\w\w\d+)|) {
411            $pdf_tools{'tex fonts'}++;
412            $texfont = $1;
413        }
414
415    }
416
417    close (PDF);
418
419    # undo any UTF-8 in ascii (literally "\376\377\000P\000r\000o...")
420    $title = utf8ascii_undo ($title) if ($title);
421    $creator = utf8ascii_undo ($creator) if ($creator);
422    $producer = utf8ascii_undo ($producer) if ($producer);
423    $creatortool = utf8ascii_undo ($creatortool) if ($creatortool);
424    $rdftitle = utf8ascii_undo  ($rdftitle) if ($rdftitle);
425    $pdfproducer = utf8ascii_undo ($pdfproducer) if ($pdfproducer);
426
427    # undo any UTF-8 in binary
428    $title = utf8bin_undo ($title) if ($title);
429    $creator = utf8bin_undo ($creator) if ($creator);
430    $producer = utf8bin_undo ($producer) if ($producer);
431    $creatortool = utf8bin_undo ($creatortool) if ($creatortool);
432    $rdftitle = utf8bin_undo  ($rdftitle) if ($rdftitle);
433    $pdfproducer = utf8bin_undo ($pdfproducer) if ($pdfproducer);
434
435    # undo any UTF-8 in binary (reversed)
436    $title = utf8revbin_undo ($title) if ($title);
437    $creator = utf8revbin_undo ($creator) if ($creator);
438    $producer = utf8revbin_undo ($producer) if ($producer);
439    $creatortool = utf8revbin_undo ($creatortool) if ($creatortool);
440    $rdftitle = utf8revbin_undo  ($rdftitle) if ($rdftitle);
441    $pdfproducer = utf8revbin_undo ($pdfproducer) if ($pdfproducer);
442
443    $title = $rdftitle if (!$title && $rdftitle);
444
445    # Word
446    if ($creator =~ /Microsoft.+Word/) {
447        # Mac OS Quartz PDFContext, doPDF
448        $pdf_tools{'word in creator'}++;
449        $app = 'word';
450    } elsif ($title =~ /Microsoft Word \-/) {
451        # ps->pdf w/ gs, distiller
452        # often doc name in title after '-' (but not always)
453        $pdf_tools{'gs, distiller'}++;
454        $app = 'word';
455    } elsif ($title =~ /Proceedings Template \- WORD/i) {
456        $pdf_tools{'template'}++;
457        $app = 'word';
458    } elsif ($creator =~ /easyPDF/) {
459        # BCL easyPDF
460        $pdf_tools{'easyPDF'}++;
461        $app = 'word';
462    } elsif ($creator =~ /PDFCreator/) {
463        $pdf_tools{'PDFCreator'}++;
464        $app = 'word';
465    } elsif ($creator =~ /PDFMaker.+Word/) {
466        $pdf_tools{'PDFMaker'}++;
467        $app = 'word';
468    } elsif ($creator =~ /Sonic PDF/) {
469        $pdf_tools{'sonic pdf'}++;
470        $app = 'word';
471    } elsif ($creatortool =~ /Word/) {
472        # Adobe XMP metadata
473        $pdf_tools{'Acrobat PDFMaker'}++;
474        $app = 'word';
475    } elsif ($producer =~ /freepdfconvert|deskPDF|ReportLab|PDF reDirect/) {
476        $pdf_tools{'misc pdf tools'}++;
477        $app = 'word';
478#    } elsif ($creator =~ /\000M\000i\000c\000r\000o\000s\000o\000f\000t.+\000W\000o\000r\000d/i) {
479        # UTF-8 binary
480#       $pdf_tools{'Word (UTF-8)'}++;
481#       $app = 'word';
482    } elsif ($pdfmachine) {
483        $pdf_tools{'pdfmachine'}++;
484        $app = 'word';
485    } elsif ($title =~ /\.docx?$/i) {
486        # Amyuni puts the filename in the title
487        $pdf_tools{'doc(x) extension'}++;
488        $app = 'word';
489    }
490
491    if ($app) {
492        push (@allapps, $app);
493        $app = '';
494
495        # never seen this happen, but let's sanity check...
496        if ($cmrfont) {
497            print STDERR "$banal_filename: Warning: CMR font in Word doc?\n";
498            $pdf_tools{'** cmrfont in word doc'}++;
499        }
500    }
501
502    # TeX
503    if ($creator =~ /TeX/) {
504        $pdf_tools{'tex in creator'}++;
505        $app = 'tex';
506    } elsif ($creatortool =~ /(MiK)?TeX/) {
507        $pdf_tools{'(mik)tex in creatortool'}++;
508        $app = 'tex';
509    } elsif ($creator =~ /dvips/) {
510        $pdf_tools{'dvips in creator'}++;
511        $app = 'tex';
512    } elsif ($producer =~ /dvips/) {
513        $pdf_tools{'dvips in producer'}++;
514        $app = 'tex';
515    } elsif ($producer =~ /PrimoPDF/ && $title =~ /\.dvi$/) {
516        $pdf_tools{'primopdf'}++;
517        $app = 'tex';
518    } elsif (($creator =~ /gnuplot/) && ($producer =~ /Ghostscript|Distiller/)) {
519        # highly likely a tex document
520        $pdf_tools{'gnuplot + gs|dist'}++;
521        $app = 'tex';
522    } elsif ($producer =~ /Ghostscript|PDFContext|pstopdf|AntennaHouse PDF/ && !$creator && !$title) {
523        # just a producer tag, no other InfoDict metadata...
524        # have yet to see a Word doc that didn't like InfoDict metadata
525        $pdf_tools{'only producer'}++;
526        $app = 'tex';
527    } elsif ($indirect && $quartzpdf && $tex) {
528        if ($creator || $producer) {
529            print STDERR "$banal_filename: Warning: direct and indirect InfoDict entries\n";
530        }
531        $pdf_tools{'tex quartzpdf'}++;
532        $app = 'tex';
533    } elsif ($creatortool =~ /gnuplot/ && !$creator && !$producer && !$title) {
534        $pdf_tools{'only gnuplot'}++;
535        $app = 'tex';
536    } elsif ($ptex) {
537        $pdf_tools{'pdftex in pdfx'}++;
538        $app = 'tex';
539    } elsif ($producer =~ /Ghostscript/ && $title =~ /\.pdf$/) {
540        $pdf_tools{'gs ps to pdf'}++;
541        $app = 'tex';
542    } elsif ($cmrfont) {
543        $pdf_tools{'cmrfont'}++;
544        $app = 'tex';
545    }
546
547    if ($app) {
548        push (@allapps, $app);
549        $app = '';
550    }
551
552    # OpenOffice
553    if ($producer =~ /OpenOffice/) {
554        $pdf_tools{'open office'}++;
555        push (@allapps, 'openoffice');
556    }
557
558    if ($creator =~ /Interleaf/) {
559        $pdf_tools{'interleaf + distiller'}++;
560        push (@allapps, 'interleaf');
561    }
562
563    # FrameMaker (!)
564    if ($creator =~ /FrameMaker/) {
565        $pdf_tools{'frame'}++;
566        push (@allapps, 'framemaker');
567    }
568
569    # sanity check that we haven't matched more than one application,
570    # or whether we didn't match anything...
571    if (scalar (@allapps) > 1) {
572        print STDERR "$banal_filename: Error: multiple apps inferred: @allapps\n";
573        $app = 'unknown';
574    } elsif (scalar (@allapps) < 1) {
575        print STDERR "$banal_filename: Warning: failed to infer document app, using 'unknown'\n";
576#       print STDERR "$banal_filename:   Creator: $creator\n" if ($creator);
577#       print STDERR "$banal_filename:   Title: $title\n" if ($title);
578#       print STDERR "$banal_filename:   Producer: $producer\n" if ($producer);
579#       print STDERR "$banal_filename:   CreatorTool: $creatortool\n" if ($creatortool);
580#       print STDERR "$banal_filename:   RDFTitle: $rdftitle\n" if ($rdftitle);
581#       print STDERR "$banal_filename:   PDFProducer: $pdfproducer\n" if ($pdfproducer);
582#       print STDERR "$banal_filename:   cmrfont\n" if ($cmrfont);
583#       print STDERR "$banal_filename:   texfont $texfont\n" if ($texfont);
584        $app = 'unknown';
585    } else {
586        $app = $allapps[0];
587    }
588
589#    $pdf_tools{$app}++;
590    $doc->{app} = $app;
591
592    if ($debug_docapp) {
593        print STDERR "$banal_filename: Creator: $creator\n" if ($creator);
594        print STDERR "$banal_filename: Title: $title\n" if ($title);
595        print STDERR "$banal_filename: Producer: $producer\n" if ($producer);
596        print STDERR "$banal_filename: CreatorTool: $creatortool\n" if ($creatortool);
597        print STDERR "$banal_filename: RDFTitle: $rdftitle\n" if ($rdftitle);
598        print STDERR "$banal_filename: PDFProducer: $pdfproducer\n" if ($pdfproducer);
599        print STDERR "$banal_filename: cmrfont\n" if ($cmrfont);
600        print STDERR "$banal_filename: texfont $texfont\n" if ($texfont);
601        foreach $t (keys %pdf_tools) {
602            print "$t: $pdf_tools{$t}\n";
603        }
604    }
605
606    return;
607}
608
609sub calc_page_leading ($) {
610    my ($page) = @_;
611#    my ($mode) = modevalkey ($page->{pagedata}->{segdata}->{leads});
612    my ($mode, $segs);
613
614    $segs = $page->{pagedata}->{segdata_byfont}->{$page->{pagedata}->{bodyfont}->{id}};
615    $mode = modevalkey ($segs->{leads});
616
617    $count = $segs->{leads}->{$mode} +
618        $segs->{leads}->{$mode - 1} +
619        $segs->{leads}->{$mode + 1};
620    if ($count <= 0) {
621        $page->{pagespec}->{lead} = 0;
622        return;
623    }
624
625    if ($banal_leading_policy eq 'mode') {
626        print "using leading policy 'mode'\n" if ($debug_leading);
627        $lead = $mode * $p2h_to_points;
628        $lead *= 10;
629        $lead = int ($lead + 0.5);
630        $lead /= 10;
631        print "leading: $lead\n" if ($debug_leading);
632        $page->{pagespec}->{lead} = $lead;
633        return;
634    }
635
636    if ($debug_leading) {
637        # leading histogram
638        $ll = $segs->{leads};
639        foreach $k (sort { $a <=> $b } keys %$ll) {
640            my ($l) = int (($k * $p2h_to_points * 10) + 0.5);
641            $l /= 10;
642            print "$l ($segs->{leads}->{$k}) ";
643        }
644        print "\n";
645    }
646
647    $wsum = $mode * ($segs->{leads}->{$mode} / $count);
648    $wsum += ($mode - 1) * ($segs->{leads}->{$mode - 1} / $count);
649    $wsum += ($mode + 1) * ($segs->{leads}->{$mode + 1} / $count);
650    $lead = $wsum * $p2h_to_points;
651    $lead *= 10;
652    $lead = int ($lead + 0.5);
653    $lead /= 10;
654
655    $page->{pagespec}->{lead} = $lead;
656
657#    print Dumper ($segs->{leads});
658}
659
660sub calc_page_columns ($) {
661    my ($page) = @_;
662    my ($segs, $maxw, $colw, $ncols);
663
664    # use estimated width of text region as base
665    my $pagew = $page->{pagespec}->{textbb}->{width};
666    my $leftmargin = $page->{pagespec}->{textbb}->{left};
667    my $paperw = $page->{pagespec}->{paperbb}->{width};
668    my $expected_pagew = $paperw - 2 * max(min($leftmargin, $paperw - $pagew - $leftmargin), 0);
669    $pagew = $expected_pagew if $pagew < 0.9 * $expected_pagew;
670
671    # use the maximum width segment in the body font to estimate
672    # column width
673    $segs = $page->{pagedata}->{segdata_byfont}->{$page->{pagedata}->{bodyfont}->{id}};
674#    $maxw = maxkey ($segs->{widths});
675    $modew = modevalkey ($segs->{widths});
676    $colw = $modew / $p2h_per_inch;
677
678    if ($colw >= ($pagew / 2.0)) {
679        $ncols = 1;
680    } elsif (($colw < ($pagew / 2.0)) && ($colw >= ($pagew / 3.0))) {
681        $ncols = 2;
682    } elsif (($colw < ($pagew / 3.0)) && ($colw >= ($pagew / 4.0))) {
683        $ncols = 3;
684    } elsif (($colw < ($pagew / 4.0)) && ($colw >= ($pagew / 5.0))) {
685        $ncols = 4;
686    } elsif (($colw < ($pagew / 5.0)) && ($colw >= ($pagew / 6.0))) {
687        $ncols = 5;
688    } elsif (($colw < ($pagew / 6.0)) && ($colw >= ($pagew / 7.0))) {
689        $ncols = 6;
690    } elsif (($colw < ($pagew / 7.0)) && ($colw >= ($pagew / 8.0))) {
691        $ncols = 7;
692    } elsif ($page->{pagespec}->{density} < $banal_min_density) {
693        $ncols = 1;
694    } else {
695        my ($num) = $page->{num};
696#       print Dumper ($segs->{widths});
697        printf STDERR "$banal_filename: Error (page $num): Unknown number of columns: width of typical text segment %.2fin, page %.2fin.\n", $colw, $pagew;
698        $ncols = 1;
699    }
700
701    $page->{pagedata}->{ncols} = $ncols;
702    $page->{pagespec}->{ncols} = $ncols;
703}
704
705sub calc_page_text_region ($$) {
706    my ($page, $segdata) = @_;
707    my ($minw, $maxw, $minh, $maxh);
708    my ($segs_minw, $segs_maxw);
709
710    $segs_minw = $segdata->{lefts};
711    $segs_maxw = $segdata->{rights};
712
713    # find the minimum left position among segments (must be
714    # multiple segments with that position to skip outliers)
715    $minw = 8 * $p2h_per_inch;
716
717    foreach $s (keys %$segs_minw) {
718        $minw = $s if (($s < $minw) && ($segs_minw->{$s} > 3));
719    }
720
721    # all consistency bets are off with low density pages
722    $minw = minkey ($segs_minw) if ($minw > 4 * $p2h_per_inch);
723
724    # find the maximum right position among segments (must be
725    # multiple segments with that position to skip outliers)
726    $maxw = 0;
727    foreach $s (keys %$segs_maxw) {
728        $maxw = $s if (($s > $maxw) && ($segs_maxw->{$s} >= 2));
729    }
730
731#    print "tmpw $tmpw maxw $maxw\n";
732#    if ($maxw < 600) {
733#       print Dumper ($segs_maxw);
734#    }
735
736    # unjustified text may not have multiple segments with the same
737    # max right position...fall back to just using the max right position
738    $maxw = maxkey ($segs_maxw) if ($maxw < $minw);
739    $maxw = $minw + minkey ($segdata->{widths}) if (!defined $maxw);
740    $maxw = $minw if ($maxw < $minw);
741
742    $minh = minkey ($segdata->{tops});
743    $maxh = maxkey ($segdata->{bots});
744
745    $page->{pagedata}->{textbb} = {
746        top => $minh,
747        left => $minw,
748        width => ($maxw - $minw),
749        height => ($maxh - $minh),
750    };
751
752#    print "$minw $maxw\n";
753#    print Dumper ($page->{pagedata}->{textbb});
754
755    $page->{pagespec}->{textbb} = {
756        top => $minh / $p2h_per_inch,
757        left => $minw / $p2h_per_inch,
758        width => ($maxw - $minw) / $p2h_per_inch,
759        height => ($maxh - $minh) / $p2h_per_inch,
760    };
761
762    return 1;
763}
764
765sub calc_page_density ($) {
766    my ($page) = @_;
767    my ($bfont, $density);
768
769    $bfont = $page->{pagedata}->{bodyfont}->{id};
770    $density = maxval ($page->{pagedata}->{segdata_byfont}->{$bfont}->{byfont});
771    $page->{pagespec}->{density} = $density;
772}
773
774sub calc_doc_body_font ($) {
775    my ($doc) = @_;
776    my ($fonts) = {};
777
778    for $i (1..$doc->{npages}) {
779        $page = $doc->{pages}->{$i};
780        $fonts->{$page->{pagespec}->{bodyfont}}++;
781    }
782
783    $doc->{pagespec}->{bodyfont} = modevalkey ($fonts);
784}
785
786sub calc_doc_leading ($) {
787    my ($doc) = @_;
788    my ($leads) = {};
789    my ($lmode, $page);
790
791    for $i (1..$doc->{npages}) {
792        $page = $doc->{pages}->{$i};
793        $leads->{$page->{pagespec}->{lead}}++;
794    }
795    $lmode = modevalkey ($leads);
796
797#    $use_raw_leading = 1;
798    if (!defined $use_raw_leading) {
799#       print "mode: $lmode\n";
800#       print "pages w mode: $leads->{$lmode}\n";
801        if ($leads->{$lmode} >= $doc->{npages} / 2) {
802            for $i (1..$doc->{npages}) {
803                $page = $doc->{pages}->{$i};
804                next if ($page->{pagespec}->{lead} == $lmode);
805
806#               print "abs diff: ", $lmode - $page->{pagespec}->{lead}, "\n";
807                if (abs ($lmode - $page->{pagespec}->{lead}) < 0.2) {
808#                   print "setting to ", $lmode, "\n";
809                    $page->{pagespec}->{lead} = $lmode;
810                }
811            }
812        }
813    }
814
815    if ($debug_leading) {
816
817        print "entire doc\n";
818
819        for $i (1..$doc->{npages}) {
820            $page = $doc->{pages}->{$i};
821            $segs = $page->{pagedata}->{segdata_byfont}->{$page->{pagedata}->{bodyfont}->{id}};
822            $leads = $segs->{leads};
823            foreach $k (keys %$leads) {
824                $doc_leads{$k} += $segs->{leads}->{$k};
825            }
826        }
827
828        foreach $k (sort { $a <=> $b } keys %doc_leads) {
829            my ($l) = int (($k * $p2h_to_points * 10) + 0.5);
830            $l /= 10;
831            print "$l ($doc_leads{$k}) ";
832        }
833        print "\n";
834
835        {
836            $mode = modevalkey (\%doc_leads);
837            print "mvk $mode\n";
838            $count = $doc_leads{$mode} +
839                $doc_leads{$mode - 1} +
840                $doc_leads{$mode + 1};
841
842            $wsum = $mode * ($doc_leads{$mode} / $count);
843            print "b: ", $wsum, "\n";
844            $wsum += ($mode - 1) * ($doc_leads{$mode - 1} / $count);
845            $wsum += ($mode + 1) * ($doc_leads{$mode + 1} / $count);
846            $lead = $wsum * $p2h_to_points;
847            print "c: ", $lead, "\n";
848            $lead *= 10;
849            print "d: ", $lead, "\n";
850            $lead = int ($lead + 0.5);
851            $lead /= 10;
852            print "lead: $lead\n";
853        }
854    }
855
856    $doc->{pagespec}->{lead} = $lmode;
857}
858
859sub calc_doc_text_region ($) {
860    my ($doc) = @_;
861    my ($page, $maxw, $maxh, $minl, $mint, $rmarg, $bmarg);
862
863    $page = $doc->{pages}->{1};
864    $maxw = $page->{pagespec}->{textbb}->{width};
865    $maxh = $page->{pagespec}->{textbb}->{height};
866    $minl = $page->{pagespec}->{textbb}->{left};
867    $mint = $page->{pagespec}->{textbb}->{top};
868
869    for $i (2..$doc->{npages}) {
870        next if ($page->{density} < $banal_min_density);
871
872        $page = $doc->{pages}->{$i};
873        $maxw = max $maxw, $page->{pagespec}->{textbb}->{width};
874        $maxh = max $maxh, $page->{pagespec}->{textbb}->{height};
875        $minl = min $minl, $page->{pagespec}->{textbb}->{left};
876        $mint = min $mint, $page->{pagespec}->{textbb}->{top};
877    }
878    $doc->{textbb}->{width} = $maxw;
879    $doc->{textbb}->{height} = $maxh;
880    $doc->{textbb}->{left} = $minl;
881    $doc->{textbb}->{top} = $mint;
882
883    $rmarg = $doc->{pagespec}->{paperbb}->{width} - ($doc->{textbb}->{width} + $doc->{textbb}->{left});
884    $bmarg = $doc->{pagespec}->{paperbb}->{height} - ($doc->{textbb}->{height} + $doc->{textbb}->{top});
885    if ($rmarg < 0) {
886        print STDERR "r MARGIN\n";
887    }
888    if ($bmarg < 0) {
889        print STDERR "b MARGIN\n";
890    }
891    $doc->{textbb}->{rmarg} = $rmarg;
892    $doc->{textbb}->{bmarg} = $bmarg;
893}
894
895sub calc_doc_page_types ($) {
896    my ($doc) = @_;
897    my ($page, $font, $type);
898
899    $font = $doc->{pagespec}->{bodyfont};
900
901    for $i (1..$doc->{npages}) {
902        $page = $doc->{pages}->{$i};
903        $type = 'body';
904
905        if ($i == 1 && $page->{pagespec}->{density} < 3000) {
906            $type = 'cover';
907        } elsif ($page->{pagespec}->{bodyfont} < $font) {
908            if (($doc->{npages} - $i) < ($doc->{npages} / 3)) {
909                $type = 'bib';
910            }
911        } elsif ($page->{pagespec}->{density} < $banal_min_density) {
912            if ($i == $doc->{npages}) {
913                $type = 'bib';
914            } else {
915                $type = 'figure';
916            }
917        }
918
919        $page->{pagespec}->{type} = $type;
920    }
921}
922
923sub calc_doc_columns ($) {
924    my ($doc) = @_;
925    my ($page);
926    my ($cols) = {};
927
928    for $i (1..$doc->{npages}) {
929        $page = $doc->{pages}->{$i};
930        $cols->{$page->{pagespec}->{ncols}}++;
931    }
932
933    # number of columns on greatest number of pages
934    $doc->{ncols} = modevalkey ($cols);
935}
936
937sub p2h_font_to_font_size ($) {
938    my ($font) = @_;
939    my ($pt) = ($font->{size} + 3) / $zoom;
940
941    if ($font->{family} eq 'Times'
942        || $font->{family} eq 'Helvetica'
943        || $font->{family} eq 'Courier'
944        || $font->{family} eq 'Symbol') {
945    } else {
946        print STDERR "$banal_filename: Error: Unknown font family.\n";
947#       print Dumper ($font);
948    }
949
950    return $pt;
951}
952
953sub p2h_font_bug ($) {
954    my ($doc) = @_;
955
956    return 1 if ($doc->{pagespec}->{bodyfont} <= 0);
957    return 0;
958}
959
960sub p2h_serious_font_bug ($) {
961    my ($doc) = @_;
962
963    return 0 if (!p2h_font_bug ($doc));
964    return 1 if ($doc->{textbb}->{width} == 0 ||
965                 $doc->{textbb}->{height} == 0);
966    return 0;
967}
968
969my %json_escapes = (
970    "\n" => "\\n",
971    "\r" => "\\r",
972    "\f" => "\\f",
973    "\t" => "\\t",
974    "\"" => "\\\"",
975    "\\" => "\\\\",
976    "/" => "\\/"
977);
978
979sub json_quote ($) {
980    my($x) = $_[0];
981    $x =~ s{[\n\r\f\t\"\\/]}{$json_escapes{$&}}ge;
982    "\"$x\"";
983}
984
985sub report_json ($) {
986    my ($doc) = @_;
987
988    printf "{\n  \"at\": %d,\n", time;
989    printf "  \"args\": %s,\n", json_quote(join(" ", @switches)) if @switches;
990
991    my $dx = {"pw" => {}, "ph" => {}, "tw" => {}, "th" => {}, "mt" => {}, "ml" => {}};
992    my $px = {}, $nummargin = 10000;
993    for my $i (1 .. $doc->{npages}) {
994        my $page = $doc->{pages}->{$i};
995        my($pbb, $tbb) = ($page->{pagespec}->{paperbb}, $page->{pagespec}->{textbb});
996        my($tl) = POSIX::floor($tbb->{left} * 72 / $grid) * $grid;
997        my($tt) = POSIX::floor($tbb->{top} * 72 / $grid) * $grid;
998        my($tr) = POSIX::ceil(($tbb->{left} + $tbb->{width}) * 72 / $grid) * $grid;
999        my($tb) = POSIX::ceil(($tbb->{top} + $tbb->{height}) * 72 / $grid) * $grid;
1000        my($pd) = {"pw" => (sprintf "%.0f", $pbb->{width} * 72 / $grid) * $grid,
1001                   "ph" => (sprintf "%.0f", $pbb->{height} * 72 / $grid) * $grid,
1002                   "mt" => $tt,
1003                   "ml" => $tl,
1004                   "tw" => $tr - $tl,
1005                   "th" => $tb - $tt};
1006        $px->{$i} = $pd;
1007        my($k, $v);
1008        while (($k, $v) = each %$pd) {
1009            $dx->{$k}->{$v} += 1;
1010        }
1011        my($pnummargin) = POSIX::floor($pd->{ph} - $page->{pagedata}->{lowest_number} * $p2h_to_points);
1012        $nummargin = min($pnummargin, $nummargin) if $pnummargin < $pd->{ph} - $tb;
1013    }
1014    my($pw, $ph) = (modevalkey($dx->{pw}), modevalkey($dx->{ph}));
1015    my($tw, $th) = (modevalkey($dx->{tw}), modevalkey($dx->{th}));
1016    my($mt, $ml) = (modevalkey($dx->{mt}), modevalkey($dx->{ml}));
1017
1018    my ($doc_ps) = sprintf "\"papersize\": [%.0f,%.0f]", $ph, $pw;
1019    my ($doc_margin) = sprintf "\"margin\": [%.0f,%.0f,%.0f,%.0f]", $mt, $pw - ($ml + $tw), $ph - ($mt + $th), $ml;
1020    my ($doc_bfs);
1021    if (p2h_font_bug($doc)) {
1022        $doc_bfs = "\"bodyfontsize\": null";
1023    } else {
1024        $doc_bfs = sprintf "\"bodyfontsize\": %g", $doc->{pagespec}->{bodyfont};
1025    }
1026    my ($doc_l) = sprintf "\"leading\": %g", $doc->{pagespec}->{lead};
1027    my ($doc_c) = sprintf "\"columns\": %d", $doc->{pages}->{1}->{pagespec}->{ncols};
1028    print "  $doc_ps,\n  $doc_margin,\n  $doc_bfs,\n  $doc_l,\n  $doc_c,\n";
1029    printf "  \"nummargin\": %.0f,\n", $nummargin if $nummargin < 10000;
1030    print "  \"pages\": [";
1031    $sep = "\n";
1032
1033    my %pages;
1034    for my $i (1 .. $doc->{npages}) {
1035        my $page = $doc->{pages}->{$i};
1036        my @val = ();
1037
1038        if ($page->{num} =~ /\A\d+\z/ && $page->{num} ne $i) {
1039            push @val, sprintf "\"pageno\": %d", $page->{num};
1040        } elsif ($page->{num} ne $i) {
1041            push @val, sprintf "\"pageno\": %s", json_quote($page->{num});
1042        }
1043
1044        my($pd) = $px->{$i};
1045        my($page_ps) = sprintf "\"papersize\": [%.0f,%.0f]", $pd->{ph}, $pd->{pw};
1046        push @val, $page_ps if $page_ps ne $doc_ps;
1047        my($page_margin) = sprintf "\"margin\": [%.0f,%.0f,%.0f,%.0f]", $pd->{mt}, $pd->{pw} - ($pd->{ml} + $pd->{tw}), $pd->{ph} - ($pd->{mt} + $pd->{th}), $pd->{ml};
1048        push @val, $page_margin if $page_margin ne $doc_margin;
1049        my($page_bfs) = sprintf "\"bodyfontsize\": %g", $page->{pagespec}->{bodyfont};
1050        push @val, $page_bfs if $page_bfs ne $doc_bfs;
1051        my($page_l) = sprintf "\"leading\": %g", $page->{pagespec}->{lead};
1052        push @val, $page_l if $page_l ne $doc_l;
1053        my($page_c) = sprintf "\"columns\": %d", $page->{pagespec}->{ncols};
1054        push @val, $page_c if $page_c ne $doc_c;
1055        push @val, sprintf "\"d\": %d", $page->{pagespec}->{density};
1056        push @val, sprintf "\"pagetype\": %s", json_quote($page->{pagespec}->{type})
1057            if $page->{pagespec}->{type} ne "body";
1058
1059        print $sep, "    {", join(", ", @val), "}";
1060        $sep = ",\n";
1061    }
1062    print "\n  ]\n}\n";
1063}
1064
1065sub report_verbose ($) {
1066    my ($doc) = @_;
1067    my ($page) = $doc->{pages}->{1};
1068
1069    print $file, "\n";
1070    if (p2h_font_bug ($doc)) {
1071        print STDERR $file, "\n";
1072        print STDERR "$banal_filename: Error: pdftohtml encountered font problems...some info likely bogus.\n";
1073    }
1074    printf "Paper size: %.2fin x %.2fin\n", $doc->{pagespec}->{paperbb}->{width}, $doc->{pagespec}->{paperbb}->{height};
1075    printf "Text region: %.2fin x %.2fin\n", $doc->{textbb}->{width},
1076           $doc->{textbb}->{height};
1077    printf "Margins: %.2fin x %.2fin x %.2fin x %.2fin (l/r/t/b)\n",
1078           $doc->{textbb}->{left},
1079           $doc->{textbb}->{rmarg},
1080           $doc->{textbb}->{top},
1081           $doc->{textbb}->{bmarg};
1082    printf "Body font size: %.2fpt", $doc->{pagespec}->{bodyfont};
1083    if (p2h_font_bug ($doc)) {
1084        print " (bogus)";
1085    }
1086    print "\n";
1087    printf "Leading: %.1fpt\n", $doc->{pagespec}->{lead};
1088    print "Columns: ", $page->{pagespec}->{ncols}, "\n";
1089    print "Pages: ", $doc->{npages}, "\n";
1090    print "App: ", $doc->{app}, "\n" if $doc->{app} ne "";
1091
1092    print "\n";
1093    for $i (1..$doc->{npages}) {
1094        $page = $doc->{pages}->{$i};
1095
1096        print "Page $page->{num}:\n";
1097        printf ("  text region: %.2fin x %.2fin\n", $page->{pagespec}->{textbb}->{width}, $page->{pagespec}->{textbb}->{height});
1098
1099        $left_i = $page->{pagespec}->{textbb}->{left};
1100        $right_i = $page->{pagespec}->{paperbb}->{width} -
1101            ($left_i + $page->{pagespec}->{textbb}->{width});
1102        $top_i = $page->{pagespec}->{textbb}->{top};
1103        $bot_i = $page->{pagespec}->{paperbb}->{height} -
1104            ($top_i + $page->{pagespec}->{textbb}->{height});
1105        printf "  margins: %.2fin x %.2fin x %.2fin x %.2fin (l/r/t/b)\n",
1106               $left_i, $right_i, $top_i, $bot_i;
1107
1108        printf "  body font: %gpt (id %d)\n", $page->{pagespec}->{bodyfont},
1109               $page->{pagedata}->{bodyfont}->{id};
1110        printf "  leading: %gpt\n", $page->{pagespec}->{lead};
1111        printf "  columns: %d\n", $page->{pagespec}->{ncols};
1112        print   "  type: ", $page->{pagespec}->{type}, "\n";
1113
1114        $density = $page->{pagespec}->{density};
1115        printf "  density: %d\n", $density;
1116    }
1117}
1118
1119sub report_stats ($) {
1120    my ($doc) = @_;
1121    my ($page) = $doc->{pages}->{1};
1122
1123    if (p2h_serious_font_bug ($doc)) {
1124        print STDERR "$banal_filename: Error: pdftohtml encountered font problems...skipping.\n";
1125        return;
1126    }
1127
1128    if (p2h_font_bug ($doc)) {
1129        print STDERR "$banal_filename: Warning: pdftohtml encountered font problems...some info likely bogus.\n";
1130    }
1131
1132    printf  "$file\t%.2fx%.2f\t%.2fx%.2f\t%.2fx%.2fx%.2fx%.2f\t%d\t%.1f\t%d\t%d\t%s\n",
1133            # page width x height
1134            $doc->{pagespec}->{paperbb}->{width},
1135            $doc->{pagespec}->{paperbb}->{height},
1136            # text region width x height
1137            $doc->{textbb}->{width},
1138            $doc->{textbb}->{height},
1139            # margins left x right x top x bottom
1140            $doc->{textbb}->{left},
1141            $doc->{textbb}->{rmarg},
1142            $doc->{textbb}->{top},
1143            $doc->{textbb}->{bmarg},
1144            # body font
1145            $doc->{pagespec}->{bodyfont},
1146            # leading
1147            $doc->{pagespec}->{lead},
1148            # columns
1149            $doc->{pagespec}->{ncols},
1150            # pages
1151            $doc->{npages},
1152            # app
1153            $doc->{app};
1154}
1155
1156sub judge_paper_size ($$) {
1157    my ($doc, $spec) = @_;
1158    my ($msg) = '';
1159    my ($w, $h);
1160
1161    $w = $doc->{pagespec}->{paperbb}->{width};
1162    $h = $doc->{pagespec}->{paperbb}->{height};
1163    if ($spec->{paper} eq 'letter') {
1164        $paperw = 8.5;
1165        $paperh = 11;
1166    } elsif ($spec->{paper} eq 'A4') {
1167        $paperw = 8.26;
1168        $paperh = 11.69;
1169    }
1170
1171    unless (((($paperw - $banal_text_fudge) < $w) &&
1172             (($paperw + $banal_text_fudge) > $w)) &&
1173            ((($paperh - $banal_text_fudge) < $h) &&
1174             (($paperh + $banal_text_fudge) > $h))) {
1175        $msg = sprintf ("Paper size: %.2f x %.2f is not $spec->{paper} size\n",
1176                        $w, $h);
1177    }
1178
1179    return $msg;
1180}
1181
1182sub judge_page_count ($$) {
1183    my ($doc, $spec) = @_;
1184    my ($msg) = '';
1185
1186    if ($doc->{npages} > $spec->{pages}) {
1187        $msg = sprintf ("Pages: too many pages %d (max %d)\n",
1188                        $doc->{npages}, $spec->{pages});
1189    } elsif ($spec->{min_pages} &&
1190             ($doc->{npages} < $spec->{min_pages})) {
1191        $msg = sprintf ("Pages: too few pages %d (min %d)\n",
1192                        $doc->{npages}, $spec->{min_pages});
1193    }
1194
1195    return $msg;
1196}
1197
1198sub judge_body_font ($$) {
1199    my ($doc, $spec) = @_;
1200    my ($msg) = '';
1201    my ($i, $font);
1202
1203    if (p2h_font_bug ($doc)) {
1204        $msg .= "Font: Cannot judge, no font info derived from pdf\n";
1205        return $msg;
1206    }
1207
1208    if ($doc->{pagespec}->{bodyfont} < $spec->{font}) {
1209        $msg .= sprintf ("Font: body font too small %dpt (min %dpt)\n",
1210                         $doc->{pagespec}->{bodyfont}, $spec->{font});
1211    }
1212    return $msg;
1213}
1214
1215sub app_msg ($) {
1216    my ($doc) = @_;
1217    return ($doc->{app} ne "" ? " using " . $doc->{app} : "");
1218}
1219
1220sub judge_leading ($$) {
1221    my ($doc, $spec) = @_;
1222    my ($msg) = '';
1223    my ($lead);
1224
1225    $lead = $doc->{pagespec}->{lead};
1226    if (($spec->{lead} - 0.1) > $lead) {
1227        $msg .= sprintf ("Leading: too small %.1fpt (min %.1fpt)%s\n",
1228                         $lead, $spec->{lead}, app_msg($doc));
1229    }
1230}
1231
1232sub judge_columns ($$) {
1233    my ($doc, $spec) = @_;
1234    my ($msg) = '';
1235    my ($i, $page);
1236
1237    # should add a 'strict' option
1238    if ($doc->{ncols} > $spec->{cols}) {
1239        $msg = sprintf ("Columns: found %d columns, expecting %d\n",
1240                        $doc->{ncols}, $spec->{cols});
1241    }
1242
1243    return $msg if (1);
1244
1245    # skip last page
1246    for $i (1..($doc->{npages} - 1)) {
1247        $page = $doc->{pages}->{$i};
1248
1249        next if ($page->{pagespec}->{density} < $banal_min_density);
1250
1251        next unless ($spec->{cols} != $page->{pagespec}->{ncols});
1252
1253        $msg = sprintf ("Columns: found %d columns, expecting %d\n",
1254                        $page->{pagespec}->{ncols}, $spec->{cols});
1255        last;
1256    }
1257
1258    return $msg;
1259}
1260
1261sub judge_text_region ($$) {
1262    my ($doc, $spec) = @_;
1263    my ($wmsg, $hmsg) = ('', '');
1264    my ($i, $page);
1265    my ($width, $height, $width_fail, $height_fail);
1266
1267    $width_fail = 0;
1268    for $i (1..$doc->{npages}) {
1269        $page = $doc->{pages}->{$i};
1270
1271        # ignore pages without much text
1272        next if ($page->{pagespec}->{density} < $banal_min_density);
1273
1274        $width = $page->{pagespec}->{textbb}->{width};
1275        next unless ($spec->{width} &&
1276                     ($width > ($spec->{width} + $spec->{fudge})));
1277        $width_fail++;
1278
1279        $wmsg = sprintf ("Width: text too wide %.2fin (max %.2fin)\n",
1280                         $width, $spec->{width});
1281    }
1282
1283    # if a small number of pages fail the width spec, it is likely
1284    # due to tables or figures extending into the margin.
1285    # only check on reasonably long docs.
1286    if ($doc->{npages} > (($banal_judge_min_fail_pages - 1) * 2)) {
1287        if ($width_fail < $banal_judge_min_fail_pages) {
1288            $wmsg = '';
1289        }
1290    }
1291
1292
1293    $height_fail = 0;
1294    for $i (1..$doc->{npages}) {
1295        $page = $doc->{pages}->{$i};
1296
1297        next if ($page->{pagespec}->{density} < $banal_min_density);
1298
1299        $height = $page->{pagespec}->{textbb}->{height};
1300        next unless ($spec->{height} &&
1301                     ($height > ($spec->{height} + $spec->{fudge})));
1302        $height_fail++;
1303
1304        $hmsg = sprintf ("Height: text too high %.2fin (max %.2fin)\n",
1305                         $height, $spec->{height});
1306    }
1307
1308    # if a small number of pages fail the height spec, it is likely
1309    # due to tables or figures extending into the margin.
1310    # only check on reasonably long docs.
1311    if ($doc->{npages} > (($banal_judge_min_fail_pages - 1) * 2)) {
1312        if ($height_fail < $banal_judge_min_fail_pages) {
1313            $hmsg = '';
1314        }
1315    }
1316
1317#    $hmsg .= sprintf ("Fail: width $width_fail height $height_fail\n");
1318
1319    return $wmsg . $hmsg;
1320}
1321
1322sub pass_judgement ($$) {
1323    my ($doc, $spec) = @_;
1324    my ($page);
1325    my ($msg) = '';
1326    my ($err);
1327
1328    if (p2h_serious_font_bug ($doc)) {
1329        print STDERR "$banal_filename: Error: pdftohtml encountered font problems...skipping.\n";
1330        return;
1331    }
1332
1333    $msg .= judge_paper_size ($doc, $spec) if ($spec->{paper});
1334    $msg .= judge_page_count ($doc, $spec) if ($spec->{pages});
1335    $msg .= judge_body_font ($doc, $spec) if ($spec->{font});
1336    $msg .= judge_leading ($doc, $spec) if ($spec->{lead});
1337    $msg .= judge_columns ($doc, $spec) if ($spec->{cols});
1338    $msg .= judge_text_region ($doc, $spec) if ($spec->{width} || $spec->{height});
1339
1340    return if (!$msg);
1341
1342    if ($format eq 'list') {
1343        chop $msg;         # remove trailing newline
1344        $msg =~ s/\n/,/g;  # convert newlines to commas
1345        print basename ($file), ",$msg\n";
1346    } else {
1347        $msg =~ s/^(.)/  $1/mg;  # indent
1348        print $file, ":\n";
1349        print $msg;
1350    }
1351}
1352
1353sub parse_p2h_fonts ($$) {
1354    my ($line, $page) = @_;
1355    my (%fonts, $font, $fontid);
1356
1357    while (1) {
1358#       print "p2h_font: $line";
1359        return $line if ($line =~ /<\/page>/);
1360
1361        last unless ($line =~ /<fontspec id=\"(\d+)\" size=\"([-]*\d+)\" family=\"([A-Za-z0-9]+)\" color=\"(\#[a-fA-F0-9]+)\"\/>/);
1362
1363        $font = { id => $1, size => $2, family => $3, color => $4 };
1364        $fontid = "$3//$2//$4";
1365        if (exists $fonts{$fontid}) {
1366            $font->{id} = $fonts{$fontid};
1367        } else {
1368            $fonts{$fontid} = $1;
1369        }
1370        $page->{doc}->{fonts}{$1} = $font;
1371
1372        $line = <$FILE>;
1373    }
1374
1375    return $line;
1376}
1377
1378sub update_segdata ($$$) {
1379    my ($page, $segdata, $seg) = @_;
1380    my ($top, $left, $width, $height, $font, $lead) = @$seg;
1381    my ($bottom) = $top + $height;
1382    my ($right) = $left + $width;
1383    my ($pagew) = $page->{pagedata}->{pagebb}->{width};
1384
1385    $segdata->{widths}{$width}++ if ($width > $p2h_per_inch);
1386    $segdata->{lefts}{$left}++ if ($left < ($pagew / 3));
1387    $segdata->{rights}{$right}++ if ($right > ($pagew / 3));
1388    $segdata->{tops}{$top}++ if ($width > $p2h_per_inch);
1389    $segdata->{bots}{$bottom}++ if ($width > $p2h_per_inch);
1390#    $segdata->{leads}{$lead}++ if ($lead > 0 && $width > $p2h_per_inch);
1391    $segdata->{leads}{$lead}++ if ($lead > 0);
1392
1393    # count number of segments in a given font size, weighted by the
1394    # width of the segment.  the font with the greatest weight
1395    # will be the body font.
1396
1397    $segdata->{byfont}{$font} += $width;
1398}
1399
1400sub check_p2h_error ($) {
1401    my ($line) = @_;
1402
1403    # check for pdftohtml error strings embedded in output
1404    return 1 if ($line =~ /^stroke seems to be a pattern/);
1405
1406    return 0;
1407}
1408
1409sub parse_p2h_text ($$) {
1410    my ($line, $page) = @_;
1411    my ($top, $bottom, $left, $right, $width, $height, $font);
1412    my ($text, $lead, $prevheight);
1413
1414    $segs_all = {};
1415    $segs_byfont = {};
1416
1417    $prevheight = 0;
1418    $lowest_number = 0;
1419
1420    while (1) {
1421#       next if (check_p2h_error ($line));
1422
1423        unless ($line =~ /<text top=\"(-?\d+)\" left=\"(-?\d+)\" width=\"(-?\d+)\" height=\"(-?\d+)\" font=\"(-?\d+)\"/) {
1424            # if we didn't match a <text>, then it should be an end of page or <image>
1425            if ($line =~ /<image/) {
1426                $line = <$FILE>;
1427                next;
1428            }
1429            unless ($line =~ /<\/page>/) {
1430                if ($debug_parse) {
1431                    print STDERR "$banal_filename: Curious, expecting a </page> but found:\n";
1432                    print STDERR $line;
1433                }
1434            }
1435            last;
1436        }
1437
1438        $height = $1;
1439        if ($prevheight < $height) {
1440            $lead = $height - $prevheight;
1441        } else {
1442            $lead = -1;
1443        }
1444        $prevheight = $height;
1445
1446        @seginfo = ($1, $2, $3, $4, $5, $lead);
1447        if (($font = $page->{doc}->{fonts}{$5})) {
1448            $seginfo[4] = $font->{id};
1449        }
1450
1451        # sanity check the data somewhat...text from embedded figures
1452        # can produce surprising values
1453        if ($1 < 0 || $2 < 0 ||
1454            ($1 > $page->{pagedata}->{pagebb}->{height}) ||
1455            ($2 > $page->{pagedata}->{pagebb}->{width})) {
1456            $line = <$FILE>;
1457            next;
1458        }
1459
1460        $nsegs++;
1461
1462        $segs_byfont->{$seginfo[4]} = {}
1463           unless (defined $segs_byfont->{$seginfo[4]});
1464        $byfont = $segs_byfont->{$seginfo[4]};
1465        update_segdata ($page, $byfont, \@seginfo);
1466        $segs_byfont{$seginfo[4]} = $byfont;
1467        update_segdata ($page, $segs_all, \@seginfo);
1468
1469        # page number detection
1470        if ($line =~ /<text[^>]*>[- ,.\/]*[0-9][- ,.\/0-9]*<\/text>/) {
1471            my($bottom) = $seginfo[0] + $seginfo[3];
1472            $lowest_number = max $lowest_number, $bottom;
1473        }
1474
1475        # embedded newlines will split <text>...</text> across multiple lines
1476        if ($line !~ /<\/text>/) {
1477            while ($line = <$FILE>) {
1478                print STDERR "$banal_filename: skipping: $line" if ($debug_parse);
1479                last if ($line  =~/<\/text>/);
1480            }
1481        }
1482
1483        $line = <$FILE>;
1484    }
1485
1486
1487    $page->{pagedata}->{nsegs} = $nsegs;
1488    $page->{pagedata}->{segdata} = $segs_all;
1489    $page->{pagedata}->{segdata_byfont} = $segs_byfont;
1490    $page->{pagedata}->{lowest_number} = $lowest_number;
1491
1492    calc_page_body_font ($page);
1493    calc_page_leading ($page);
1494    calc_page_density ($page);
1495    calc_page_text_region ($page, $segs_all);
1496    calc_page_columns ($page);
1497}
1498
1499sub parse_p2h_page ($) {
1500    my ($doc) = @_;
1501
1502    # assume we've just read the header
1503    $line = <$FILE>;
1504
1505    # skip any error strings embedded between pages
1506    while (check_p2h_error ($line)) {
1507        print STDERR "$banal_filename: skipping p2h error string: $line" if ($debug_parse);
1508        $line = <$FILE>;
1509    }
1510
1511    if ($line !~ /<page/ && $line =~ /<outline>/) {
1512        my($nout) = 0;
1513        while (1) {
1514            ++$nout if $line =~ /<outline>/;
1515            --$nout if $line =~ /<\/outline>/;
1516            last if $nout == 0;
1517            $line = <$FILE>;
1518        }
1519        $line = <$FILE> if $line =~ /<\/outline>\s*$/;
1520    }
1521
1522    unless ($line =~ /<page number=\"(\d+)\" position=\"([A-Za-z0-9]+\") top=\"(\d+)\" left=\"(\d+)\" height=\"(\d+)\" width=\"(\d+)\"/) {
1523        return '' if ($line =~ /<\/pdf2xml/);
1524        print STDERR "$banal_filename: Error: \"<page ...\" node expected for page ", $doc->{npages} + 1, "\n";
1525        chomp $line;
1526        print STDERR "-> '$line'\n";
1527        return '';
1528    }
1529
1530    # initialize page data structures
1531    $pagebb = {
1532        top => $3,
1533        left => $4,
1534        height => $5,
1535        width => $6,
1536    };
1537
1538    $paperbb = {
1539        top => $3 / $p2h_per_inch,
1540        left => $4 / $p2h_per_inch,
1541        height => $5 / $p2h_per_inch,
1542        width => $6 / $p2h_per_inch,
1543    };
1544
1545    $page = {
1546        doc => $doc,
1547        num => $1,
1548        pagedata => {
1549            pagebb => $pagebb,
1550        },
1551        pagespec => {
1552            paperbb => $paperbb,
1553        },
1554    };
1555
1556    # check for optional fontspecs at start of page
1557    $line = <$FILE>;
1558    if ($line =~ /<fontspec/) {
1559        $line = parse_p2h_fonts ($line, $page);
1560    } elsif ($debug_parse) {
1561        print STDERR "$banal_filename: Curious, no fontspec on page, found:\n";
1562        print STDERR "$line";
1563    }
1564
1565
1566    # process text segments
1567    if ($line =~ /<(?:text|image)/) {
1568        parse_p2h_text ($line, $page);
1569    } elsif ($debug_parse) {
1570        print STDERR "$banal_filename: Curious, empty page $page->{num}, found:\n";
1571        print STDERR "$line";
1572    }
1573
1574    return $page;
1575}
1576
1577sub parse_p2h_header ($) {
1578    my ($doc) = @_;
1579
1580    while (<$FILE>) {
1581        return 1 if (/<pdf2xml/);
1582    }
1583    return 0;
1584}
1585
1586sub merge_page ($$) {
1587    my ($doc, $page) = @_;
1588
1589    $doc->{npages}++;
1590    $doc->{pages}->{$page->{num}} = $page;
1591
1592    # initialize doc spec with first page spec
1593    if ($page->{num} == 1) {
1594        $doc->{pagespec}->{paperbb} = $page->{pagespec}->{paperbb};
1595        $doc->{pagespec}->{textbb} = $page->{pagespec}->{textbb};
1596        $doc->{pagespec}->{bodyfont} = $page->{pagespec}->{bodyfont};
1597        $doc->{pagespec}->{ncols} = $page->{pagespec}->{ncols};
1598        return;
1599    }
1600}
1601
1602sub banal_get_spec () {
1603    my ($s) = {};
1604
1605    return $s unless (defined $judge);
1606
1607    if (defined $paper) {
1608        if ($paper ne 'letter' && $paper ne 'A4') {
1609            die ("$banal_filename: Error: Unknown paper type '$paper'.\n");
1610        }
1611        $s->{paper} = $paper;
1612    }
1613    $s->{pages} = $pages if (defined $pages);
1614    $s->{font} = $font if (defined $font);
1615    $s->{lead} = $leading if (defined $leading);
1616    $s->{cols} = $cols if (defined $cols);
1617    if (defined $width) {
1618        $s->{width} = $width;
1619        $s->{fudge} = $banal_text_fudge;
1620    }
1621    if (defined $height) {
1622        $s->{height} = $height;
1623        $s->{fudge} = $banal_text_fudge;
1624    }
1625    if (defined $fudge) {
1626        $s->{fudge} = $fudge;
1627    }
1628    return $s;
1629}
1630
1631sub banal_report_spec ($) {
1632    my ($spec) = @_;
1633
1634    print "Judging: ";
1635    print "$spec->{paper}, " if ($spec->{paper});
1636    print "$spec->{width}in x $spec->{height}in (~$spec->{fudge}), " if ($spec->{width} || $spec->{height});
1637    print "$spec->{font}pt font, " if ($spec->{font});
1638    print "$spec->{lead}pt leading, " if ($spec->{lead});
1639    print "$spec->{cols} cols, " if ($spec->{cols});
1640    print "$spec->{pages} pages" if ($spec->{pages});
1641    print "\n";
1642    print "-   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   -   \n";
1643}
1644
1645sub banal_file ($$) {
1646    my ($file, $spec) = @_;
1647
1648    # initialize doc data structure
1649    $doc = {
1650        width => 0,
1651        height => 0,
1652        npages => 0,
1653        ncols => 0,
1654        fonts => {},
1655        pages => {},
1656        textbb => {},
1657        app => '',
1658        fullpath => '',
1659        filename => '',
1660    };
1661
1662    $doc->{fullpath} = $file;
1663    $banal_fullpath = $file;
1664    $doc->{filename} = basename ($file);
1665    $banal_filename = basename ($file);
1666
1667    if (!parse_p2h_header ($doc)) {
1668        print STDERR "$banal_filename: Error: No pdftohtml output...corrupted pdf file?\n";
1669        return;
1670    }
1671
1672    calc_doc_app ($doc) if !$no_app;
1673
1674    while ($page = parse_p2h_page ($doc)) {
1675        merge_page ($doc, $page);
1676    }
1677
1678    calc_doc_body_font ($doc);
1679    calc_doc_leading ($doc);
1680    calc_doc_text_region ($doc);
1681    calc_doc_page_types ($doc);
1682    calc_doc_columns ($doc);
1683
1684    if (defined $judge) {
1685        pass_judgement ($doc, $spec);
1686    } elsif (defined $stats) {
1687        report_stats ($doc);
1688    } elsif (defined $json) {
1689        report_json ($doc);
1690    } else {
1691        report_verbose ($doc);
1692    }
1693}
1694
1695sub shell_quote ($) {
1696    my($s) = @_;
1697    $s =~ s/\'/\'\"\'\"\'/g;
1698    return "'$s'";
1699}
1700
1701sub banal_open_input ($) {
1702    my ($fname) = @_;
1703    my ($base, $ext, $cmd, $oname);
1704
1705    if ($fname =~ /(.+)\.(.+)/) {
1706        ($base, $ext) = ($1, $2);
1707    } else {
1708        print STDERR "$fname: Error: Unable to determine file type from extension.\n";
1709        return 0;
1710    }
1711
1712    # 2>&1
1713    if ($ext =~ /^pdf$/i) {
1714        ($FILE, $oname) = File::Temp::tempfile("banalXXXXX", UNLINK => 1, SUFFIX => ".xml", TMPDIR => 1);
1715        $zoomarg = "-zoom $zoom";
1716        $cmd = "$pdftohtml -enc UTF-8 -xml -i $zoomarg " . shell_quote($fname) . " " . shell_quote($oname) . " 2>&1";
1717        print STDERR "$cmd\n" if ($debug_pdftohtml);
1718
1719        my($ignore_output) = `$cmd`;
1720        unless (-s $FILE) {
1721            print STDERR "$fname: Error: Failed to open file.\n";
1722            return 0;
1723        }
1724    } elsif ($ext =~ /^xml$/i) {
1725        unless (open ($FILE, "$fname")) {
1726            print STDERR "$fname: Error: Failed to open file.\n";
1727            return 0;
1728        }
1729    } else {
1730        print STDERR "$fname: Error: Failed to open file.\n";
1731        return 0;
1732    }
1733    binmode ($FILE, ":utf8");
1734
1735    return 1;
1736}
1737
1738sub banal_config_p2h ($) {
1739    my ($fname) = @_;
1740    $fname = basename($fname);
1741    my ($major, $minor, $poppler);
1742
1743    if (!defined($zoom)) {
1744        unless (open(P2H, "$pdftohtml -v 2>&1 |")) {
1745            print STDERR "$fname: Error: Failed to run $pdftohtml.\n";
1746            while (defined($_ = <P2H>)) {
1747                print STDERR;
1748            }
1749            return 0;
1750        }
1751        while (defined($_ = <P2H>)) {
1752            $poppler = 1 if /Poppler/;
1753            next unless (/pdftohtml version (\d+\.\d+)([a-z]*)/);
1754            $p2h_version = "$1$2";
1755            $major = $1;
1756            $minor = $2;
1757        }
1758        close (P2H);
1759
1760        if (($major >= 0.40) && $minor && (($minor cmp "c") >= 0)) {
1761            # configure for versions 0.40c and above
1762            $zoom = 10;
1763        } else {
1764            $zoom = 3;
1765        }
1766    }
1767
1768    if ($leading_policy) {
1769        $banal_leading_policy = $leading_policy;
1770    } else {
1771        # use a default policy according to the zoom level we can use
1772        # at low zoom, interpolate
1773        if ($zoom >= 10) {
1774            $banal_leading_policy = 'mode';
1775        } else {
1776            $banal_leading_policy = 'interpolate';
1777        }
1778    }
1779
1780    print "leading policy: $banal_leading_policy\n" if ($debug_leading);
1781
1782
1783    $p2h_per_inch = 72 * $zoom;
1784    $p2h_to_points = 72 / $p2h_per_inch;
1785
1786    return 1;
1787}
1788
1789sub banal_version () {
1790    print "Banal version $banal_version.\n";
1791    return 0;
1792}
1793
1794sub main () {
1795    my ($spec);
1796
1797    return banal_version () if (defined $version);
1798
1799    usage if ($#ARGV < 0);
1800
1801    $spec = banal_get_spec ();
1802    banal_report_spec ($spec) if (defined $judge);
1803
1804    if (!banal_config_p2h ($ARGV[0])) {
1805        return 1;
1806    }
1807
1808    foreach $file (@ARGV) {
1809        # open input file into FILE
1810        next unless (banal_open_input ($file));
1811        banal_file ($file, $spec);
1812        close $FILE;
1813    }
1814    return 0;
1815}
1816
1817exit (main ());
1818
1819#
1820# 2011-1-25
1821#    (utf8revbin_undo): new function; a tool incorrectly reverses multibytes
1822#
1823# 2011-1-19
1824#    (check_p2h_error): skip 'stroke seems...' pdftohtml output that can
1825#    appear between page output.
1826#
1827# 2011-1-18
1828#    (parse_p2h_page, parse_p2h_text, parse_p2h_fonts): handle <text>
1829#    segments that span multiple lines from embedded newlines accurately.
1830#    handle optional <fontspecc> commands more gracefully.
1831#    (debug_parse): new flag.
1832#
1833# 2011-1-17
1834#    (update_segdata): fix reporting negative leadings.
1835#
1836# 2011-1-11
1837#    (utf8ascii_undo, utf8bin_undo, utf8hex_undo, calc_doc_app): new functions
1838#    for inferring the application used to create the document.
1839#    (report_verbose, report_stats, judge_leading): report doc application.
1840#    (debug_docapp): new flag.
1841#
1842# 2011-1-07
1843#    uniformly print filename in error messages.
1844#
1845# 2010-12-31
1846#
1847#    (judge_format): new flag, option 'list' reports all violations
1848#    on a single line in CSV format.  default option 'lines' is original
1849#    behavior with one per line.
1850#
1851