1 //========================================================================
2 //
3 // HtmlOutputDev.cc
4 //
5 // Copyright 1997-2002 Glyph & Cog, LLC
6 //
7 // Changed 1999-2000 by G.Ovtcharov
8 //
9 // Changed 2002 by Mikhail Kruk
10 //
11 //========================================================================
12 
13 //========================================================================
14 //
15 // Modified under the Poppler project - http://poppler.freedesktop.org
16 //
17 // All changes made under the Poppler project to this file are licensed
18 // under GPL version 2 or later
19 //
20 // Copyright (C) 2005-2013, 2016-2021 Albert Astals Cid <aacid@kde.org>
21 // Copyright (C) 2008 Kjartan Maraas <kmaraas@gnome.org>
22 // Copyright (C) 2008 Boris Toloknov <tlknv@yandex.ru>
23 // Copyright (C) 2008 Haruyuki Kawabe <Haruyuki.Kawabe@unisys.co.jp>
24 // Copyright (C) 2008 Tomas Are Haavet <tomasare@gmail.com>
25 // Copyright (C) 2009 Warren Toomey <wkt@tuhs.org>
26 // Copyright (C) 2009, 2011 Carlos Garcia Campos <carlosgc@gnome.org>
27 // Copyright (C) 2009 Reece Dunn <msclrhd@gmail.com>
28 // Copyright (C) 2010, 2012, 2013 Adrian Johnson <ajohnson@redneon.com>
29 // Copyright (C) 2010 Hib Eris <hib@hiberis.nl>
30 // Copyright (C) 2010 OSSD CDAC Mumbai by Leena Chourey (leenac@cdacmumbai.in) and Onkar Potdar (onkar@cdacmumbai.in)
31 // Copyright (C) 2011 Joshua Richardson <jric@chegg.com>
32 // Copyright (C) 2011 Stephen Reichling <sreichling@chegg.com>
33 // Copyright (C) 2011, 2012 Igor Slepchin <igor.slepchin@gmail.com>
34 // Copyright (C) 2012 Ihar Filipau <thephilips@gmail.com>
35 // Copyright (C) 2012 Gerald Schmidt <solahcin@gmail.com>
36 // Copyright (C) 2012 Pino Toscano <pino@kde.org>
37 // Copyright (C) 2013 Thomas Freitag <Thomas.Freitag@alfa.de>
38 // Copyright (C) 2013 Julien Nabet <serval2412@yahoo.fr>
39 // Copyright (C) 2013 Johannes Brandstätter <jbrandstaetter@gmail.com>
40 // Copyright (C) 2014 Fabio D'Urso <fabiodurso@hotmail.it>
41 // Copyright (C) 2016 Vincent Le Garrec <legarrec.vincent@gmail.com>
42 // Copyright (C) 2017 Caolán McNamara <caolanm@redhat.com>
43 // Copyright (C) 2018 Klarälvdalens Datakonsult AB, a KDAB Group company, <info@kdab.com>. Work sponsored by the LiMux project of the city of Munich
44 // Copyright (C) 2018 Thibaut Brard <thibaut.brard@gmail.com>
45 // Copyright (C) 2018-2020 Adam Reichold <adam.reichold@t-online.de>
46 // Copyright (C) 2019, 2020 Oliver Sander <oliver.sander@tu-dresden.de>
47 // Copyright (C) 2020 Eddie Kohler <ekohler@gmail.com>
48 // Copyright (C) 2021 Christopher Hasse <hasse.christopher@gmail.com>
49 //
50 // To see a description of the changes please see the Changelog file that
51 // came with your tarball or type make ChangeLog if you are building from git
52 //
53 //========================================================================
54 
55 #include "config.h"
56 #include <cstdio>
57 #include <cstdlib>
58 #include <cstdarg>
59 #include <cstddef>
60 #include <cctype>
61 #include <cmath>
62 #include <iostream>
63 #include "goo/GooString.h"
64 #include "goo/gbasename.h"
65 #include "goo/gbase64.h"
66 #include "goo/gbasename.h"
67 #include "UnicodeMap.h"
68 #include "goo/gmem.h"
69 #include "Error.h"
70 #include "GfxState.h"
71 #include "Page.h"
72 #include "Annot.h"
73 #include "PNGWriter.h"
74 #include "GlobalParams.h"
75 #include "HtmlOutputDev.h"
76 #include "HtmlFonts.h"
77 #include "HtmlUtils.h"
78 #include "InMemoryFile.h"
79 #include "Outline.h"
80 #include "PDFDoc.h"
81 
82 #ifdef ENABLE_LIBPNG
83 #    include <png.h>
84 #endif
85 
86 #define DEBUG __FILE__ << ": " << __LINE__ << ": DEBUG: "
87 
88 class HtmlImage
89 {
90 public:
HtmlImage(GooString * _fName,GfxState * state)91     HtmlImage(GooString *_fName, GfxState *state) : fName(_fName)
92     {
93         state->transform(0, 0, &xMin, &yMax);
94         state->transform(1, 1, &xMax, &yMin);
95     }
~HtmlImage()96     ~HtmlImage() { delete fName; }
97     HtmlImage(const HtmlImage &) = delete;
98     HtmlImage &operator=(const HtmlImage &) = delete;
99 
100     double xMin, xMax; // image x coordinates
101     double yMin, yMax; // image y coordinates
102     GooString *fName; // image file name
103 };
104 
// Returns true when the distance from x to y is strictly smaller than the
// distance from x to z (a tie yields false).
static inline bool IS_CLOSER(float x, float y, float z)
{
    const float distanceToY = std::fabs(x - y);
    const float distanceToZ = std::fabs(x - z);
    return distanceToY < distanceToZ;
}
110 
// Option flags declared here and defined in another translation unit.
// NOTE(review): presumably these mirror command-line switches -- confirm
// against the file that defines them.
extern bool complexMode;
extern bool singleHtml;
extern bool dataUrls;
extern bool ignore;
extern bool printCommands;
extern bool printHtml;
extern bool noframes;
extern bool stout;
extern bool xml;
extern bool noRoundedCoordinates;
extern bool showHidden;
extern bool noMerge;

// Multiplied by a string's height to decide where words/strings are split.
extern double wordBreakThreshold;

// When true, the code in this file writes extra diagnostics to std::cerr.
static bool debug = false;
// Scratch buffer shared by print_matrix() and print_uni_str(): each call
// deletes the previous contents, so a returned C string is only valid until
// the next call to either helper.
static GooString *gstr_buff0 = nullptr; // a workspace in which I format strings
128 
#if 0
// Disabled helper (compiled out by the surrounding #if 0).
// Scans str from the end for the last SLASH character and returns a newly
// allocated GooString holding everything up to and including it; returns an
// empty GooString when str contains no SLASH.
static GooString* Dirname(GooString* str){

  char *p=str->c_str();
  int len=str->getLength();
  for (int i=len-1;i>=0;i--)
    if (*(p+i)==SLASH)
      return new GooString(p,i+1);
  return new GooString();
}
#endif
140 
print_matrix(const double * mat)141 static const char *print_matrix(const double *mat)
142 {
143     delete gstr_buff0;
144 
145     gstr_buff0 = GooString::format("[{0:g} {1:g} {2:g} {3:g} {4:g} {5:g}]", *mat, mat[1], mat[2], mat[3], mat[4], mat[5]);
146     return gstr_buff0->c_str();
147 }
148 
print_uni_str(const Unicode * u,const unsigned uLen)149 static const char *print_uni_str(const Unicode *u, const unsigned uLen)
150 {
151     GooString *gstr_buff1 = nullptr;
152 
153     delete gstr_buff0;
154 
155     if (!uLen)
156         return "";
157     gstr_buff0 = GooString::format("{0:c}", (*u < 0x7F ? *u & 0xFF : '?'));
158     for (unsigned i = 1; i < uLen; i++) {
159         if (u[i] < 0x7F) {
160             gstr_buff1 = gstr_buff0->append(u[i] < 0x7F ? static_cast<char>(u[i]) & 0xFF : '?');
161             delete gstr_buff0;
162             gstr_buff0 = gstr_buff1;
163         }
164     }
165 
166     return gstr_buff0->c_str();
167 }
168 
169 //------------------------------------------------------------------------
170 // HtmlString
171 //------------------------------------------------------------------------
172 
HtmlString(GfxState * state,double fontSize,HtmlFontAccu * _fonts)173 HtmlString::HtmlString(GfxState *state, double fontSize, HtmlFontAccu *_fonts) : fonts(_fonts)
174 {
175     GfxFont *font;
176     double x, y;
177 
178     state->transform(state->getCurX(), state->getCurY(), &x, &y);
179     if ((font = state->getFont())) {
180         double ascent = font->getAscent();
181         double descent = font->getDescent();
182         if (ascent > 1.05) {
183             // printf( "ascent=%.15g is too high, descent=%.15g\n", ascent, descent );
184             ascent = 1.05;
185         }
186         if (descent < -0.4) {
187             // printf( "descent %.15g is too low, ascent=%.15g\n", descent, ascent );
188             descent = -0.4;
189         }
190         yMin = y - ascent * fontSize;
191         yMax = y - descent * fontSize;
192         GfxRGB rgb;
193         state->getFillRGB(&rgb);
194         HtmlFont hfont = HtmlFont(font, static_cast<int>(fontSize), rgb, state->getFillOpacity());
195         if (isMatRotOrSkew(state->getTextMat())) {
196             double normalizedMatrix[4];
197             memcpy(normalizedMatrix, state->getTextMat(), sizeof(normalizedMatrix));
198             // browser rotates the opposite way
199             // so flip the sign of the angle -> sin() components change sign
200             if (debug)
201                 std::cerr << DEBUG << "before transform: " << print_matrix(normalizedMatrix) << std::endl;
202             normalizedMatrix[1] *= -1;
203             normalizedMatrix[2] *= -1;
204             if (debug)
205                 std::cerr << DEBUG << "after reflecting angle: " << print_matrix(normalizedMatrix) << std::endl;
206             normalizeRotMat(normalizedMatrix);
207             if (debug)
208                 std::cerr << DEBUG << "after norm: " << print_matrix(normalizedMatrix) << std::endl;
209             hfont.setRotMat(normalizedMatrix);
210         }
211         fontpos = fonts->AddFont(hfont);
212     } else {
213         // this means that the PDF file draws text without a current font,
214         // which should never happen
215         yMin = y - 0.95 * fontSize;
216         yMax = y + 0.35 * fontSize;
217         fontpos = 0;
218     }
219     if (yMin == yMax) {
220         // this is a sanity check for a case that shouldn't happen -- but
221         // if it does happen, we want to avoid dividing by zero later
222         yMin = y;
223         yMax = y + 1;
224     }
225     col = 0;
226     text = nullptr;
227     xRight = nullptr;
228     link = nullptr;
229     len = size = 0;
230     yxNext = nullptr;
231     xyNext = nullptr;
232     htext = new GooString();
233     dir = textDirUnknown;
234 }
235 
~HtmlString()236 HtmlString::~HtmlString()
237 {
238     gfree(text);
239     delete htext;
240     gfree(xRight);
241 }
242 
addChar(GfxState * state,double x,double y,double dx,double dy,Unicode u)243 void HtmlString::addChar(GfxState *state, double x, double y, double dx, double dy, Unicode u)
244 {
245     if (dir == textDirUnknown) {
246         // dir = UnicodeMap::getDirection(u);
247         dir = textDirLeftRight;
248     }
249 
250     if (len == size) {
251         size += 16;
252         text = (Unicode *)grealloc(text, size * sizeof(Unicode));
253         xRight = (double *)grealloc(xRight, size * sizeof(double));
254     }
255     text[len] = u;
256     if (len == 0) {
257         xMin = x;
258     }
259     xMax = xRight[len] = x + dx;
260     // printf("added char: %f %f xright = %f\n", x, dx, x+dx);
261     ++len;
262 }
263 
endString()264 void HtmlString::endString()
265 {
266     if (dir == textDirRightLeft && len > 1) {
267         // printf("will reverse!\n");
268         for (int i = 0; i < len / 2; i++) {
269             Unicode ch = text[i];
270             text[i] = text[len - i - 1];
271             text[len - i - 1] = ch;
272         }
273     }
274 }
275 
276 //------------------------------------------------------------------------
277 // HtmlPage
278 //------------------------------------------------------------------------
279 
HtmlPage(bool rawOrderA)280 HtmlPage::HtmlPage(bool rawOrderA)
281 {
282     rawOrder = rawOrderA;
283     curStr = nullptr;
284     yxStrings = nullptr;
285     xyStrings = nullptr;
286     yxCur1 = yxCur2 = nullptr;
287     fonts = new HtmlFontAccu();
288     links = new HtmlLinks();
289     pageWidth = 0;
290     pageHeight = 0;
291     fontsPageMarker = 0;
292     DocName = nullptr;
293     firstPage = -1;
294 }
295 
~HtmlPage()296 HtmlPage::~HtmlPage()
297 {
298     clear();
299     delete DocName;
300     delete fonts;
301     delete links;
302     for (auto entry : imgList) {
303         delete entry;
304     }
305 }
306 
updateFont(GfxState * state)307 void HtmlPage::updateFont(GfxState *state)
308 {
309     GfxFont *font;
310     const char *name;
311     int code;
312     double w;
313 
314     // adjust the font size
315     fontSize = state->getTransformedFontSize();
316     if ((font = state->getFont()) && font->getType() == fontType3) {
317         // This is a hack which makes it possible to deal with some Type 3
318         // fonts.  The problem is that it's impossible to know what the
319         // base coordinate system used in the font is without actually
320         // rendering the font.  This code tries to guess by looking at the
321         // width of the character 'm' (which breaks if the font is a
322         // subset that doesn't contain 'm').
323         for (code = 0; code < 256; ++code) {
324             if ((name = ((Gfx8BitFont *)font)->getCharName(code)) && name[0] == 'm' && name[1] == '\0') {
325                 break;
326             }
327         }
328         if (code < 256) {
329             w = ((Gfx8BitFont *)font)->getWidth(code);
330             if (w != 0) {
331                 // 600 is a generic average 'm' width -- yes, this is a hack
332                 fontSize *= w / 0.6;
333             }
334         }
335         const double *fm = font->getFontMatrix();
336         if (fm[0] != 0) {
337             fontSize *= fabs(fm[3] / fm[0]);
338         }
339     }
340 }
341 
beginString(GfxState * state,const GooString * s)342 void HtmlPage::beginString(GfxState *state, const GooString *s)
343 {
344     curStr = new HtmlString(state, fontSize, fonts);
345 }
346 
conv()347 void HtmlPage::conv()
348 {
349     for (HtmlString *tmp = yxStrings; tmp; tmp = tmp->yxNext) {
350         delete tmp->htext;
351         tmp->htext = HtmlFont::HtmlFilter(tmp->text, tmp->len);
352 
353         int linkIndex = 0;
354         if (links->inLink(tmp->xMin, tmp->yMin, tmp->xMax, tmp->yMax, linkIndex)) {
355             tmp->link = links->getLink(linkIndex);
356         }
357     }
358 }
359 
addChar(GfxState * state,double x,double y,double dx,double dy,double ox,double oy,const Unicode * u,int uLen)360 void HtmlPage::addChar(GfxState *state, double x, double y, double dx, double dy, double ox, double oy, const Unicode *u, int uLen)
361 {
362     double x1, y1, w1, h1, dx2, dy2;
363     int n, i;
364     state->transform(x, y, &x1, &y1);
365     n = curStr->len;
366 
367     // check that new character is in the same direction as current string
368     // and is not too far away from it before adding
369     // if ((UnicodeMap::getDirection(u[0]) != curStr->dir) ||
370     // XXX
371     if (debug) {
372         const double *text_mat = state->getTextMat();
373         // rotation is (cos q, sin q, -sin q, cos q, 0, 0)
374         // sin q is zero iff there is no rotation, or 180 deg. rotation;
375         // for 180 rotation, cos q will be negative
376         if (text_mat[0] < 0 || !is_within(text_mat[1], .1, 0)) {
377             std::cerr << DEBUG << "rotation matrix for \"" << print_uni_str(u, uLen) << '"' << std::endl;
378             std::cerr << "text " << print_matrix(state->getTextMat());
379         }
380     }
381     if (n > 0 && // don't start a new string, unless there is already a string
382                  // TODO: the following line assumes that text is flowing left to
383                  // right, which will not necessarily be the case, e.g. if rotated;
384                  // It assesses whether or not two characters are close enough to
385                  // be part of the same string
386         fabs(x1 - curStr->xRight[n - 1]) > wordBreakThreshold * (curStr->yMax - curStr->yMin) &&
387         // rotation is (cos q, sin q, -sin q, cos q, 0, 0)
388         // sin q is zero iff there is no rotation, or 180 deg. rotation;
389         // for 180 rotation, cos q will be negative
390         !rot_matrices_equal(curStr->getFont().getRotMat(), state->getTextMat())) {
391         endString();
392         beginString(state, nullptr);
393     }
394     state->textTransformDelta(state->getCharSpace() * state->getHorizScaling(), 0, &dx2, &dy2);
395     dx -= dx2;
396     dy -= dy2;
397     state->transformDelta(dx, dy, &w1, &h1);
398     if (uLen != 0) {
399         w1 /= uLen;
400         h1 /= uLen;
401     }
402     for (i = 0; i < uLen; ++i) {
403         curStr->addChar(state, x1 + i * w1, y1 + i * h1, w1, h1, u[i]);
404     }
405 }
406 
endString()407 void HtmlPage::endString()
408 {
409     HtmlString *p1, *p2;
410     double h, y1, y2;
411 
412     // throw away zero-length strings -- they don't have valid xMin/xMax
413     // values, and they're useless anyway
414     if (curStr->len == 0) {
415         delete curStr;
416         curStr = nullptr;
417         return;
418     }
419 
420     curStr->endString();
421 
422 #if 0 //~tmp
423   if (curStr->yMax - curStr->yMin > 20) {
424     delete curStr;
425     curStr = NULL;
426     return;
427   }
428 #endif
429 
430     // insert string in y-major list
431     h = curStr->yMax - curStr->yMin;
432     y1 = curStr->yMin + 0.5 * h;
433     y2 = curStr->yMin + 0.8 * h;
434     if (rawOrder) {
435         p1 = yxCur1;
436         p2 = nullptr;
437     } else if ((!yxCur1 || (y1 >= yxCur1->yMin && (y2 >= yxCur1->yMax || curStr->xMax >= yxCur1->xMin))) && (!yxCur2 || (y1 < yxCur2->yMin || (y2 < yxCur2->yMax && curStr->xMax < yxCur2->xMin)))) {
438         p1 = yxCur1;
439         p2 = yxCur2;
440     } else {
441         for (p1 = nullptr, p2 = yxStrings; p2; p1 = p2, p2 = p2->yxNext) {
442             if (y1 < p2->yMin || (y2 < p2->yMax && curStr->xMax < p2->xMin))
443                 break;
444         }
445         yxCur2 = p2;
446     }
447     yxCur1 = curStr;
448     if (p1)
449         p1->yxNext = curStr;
450     else
451         yxStrings = curStr;
452     curStr->yxNext = p2;
453     curStr = nullptr;
454 }
455 
// Returns a pointer to the LAST occurrence of ss inside s, or nullptr when ss
// does not occur in s.
//
// s  - NUL-terminated string to search
// ss - NUL-terminated needle
//
// An empty needle matches at every position, so the last match is the
// terminating NUL of s.
static const char *strrstr(const char *s, const char *ss)
{
    const char *last = nullptr;
    const char *hit = strstr(s, ss);
    while (hit != nullptr) {
        last = hit;
        // A hit on the terminator (only possible with an empty needle) is the
        // final one; searching from hit + 1 would read past the string.
        if (*hit == '\0') {
            break;
        }
        hit = strstr(hit + 1, ss);
    }
    return last;
}
464 
CloseTags(GooString * htext,bool & finish_a,bool & finish_italic,bool & finish_bold)465 static void CloseTags(GooString *htext, bool &finish_a, bool &finish_italic, bool &finish_bold)
466 {
467     const char *last_italic = finish_italic && (finish_bold || finish_a) ? strrstr(htext->c_str(), "<i>") : nullptr;
468     const char *last_bold = finish_bold && (finish_italic || finish_a) ? strrstr(htext->c_str(), "<b>") : nullptr;
469     const char *last_a = finish_a && (finish_italic || finish_bold) ? strrstr(htext->c_str(), "<a ") : nullptr;
470     if (finish_a && (finish_italic || finish_bold) && last_a > (last_italic > last_bold ? last_italic : last_bold)) {
471         htext->append("</a>", 4);
472         finish_a = false;
473     }
474     if (finish_italic && finish_bold && last_italic > last_bold) {
475         htext->append("</i>", 4);
476         finish_italic = false;
477     }
478     if (finish_bold)
479         htext->append("</b>", 4);
480     if (finish_italic)
481         htext->append("</i>", 4);
482     if (finish_a)
483         htext->append("</a>");
484 }
485 
// Strings are lines of text;
// This function aims to combine strings into lines and paragraphs if !noMerge
// It may also strip out duplicate strings (if they are on top of each other); sometimes they are to create a font effect
//
// Works in place on the y-major list (yxStrings):
//  1. unless complexMode is set, a later string that repeats an earlier one
//     (same length, same code points, nearly the same box) is unlinked and
//     deleted;
//  2. the list is walked pairwise (str1, str2 = str1->yxNext); when the merge
//     test passes, str2's text, right edges and HTML are appended to str1 and
//     str2 is deleted, otherwise str1's open tags are closed and the walk
//     moves on to str2;
//  3. <b>, <i> and link markup is inserted into / appended to htext as fonts
//     and links change between fragments.
void HtmlPage::coalesce()
{
    HtmlString *str1, *str2;
    double space, horSpace, vertSpace, vertOverlap;
    bool addSpace, addLineBreak;
    int n, i;
    double curX, curY;

#if 0 //~ for debugging
  for (str1 = yxStrings; str1; str1 = str1->yxNext) {
    printf("x=%f..%f  y=%f..%f  size=%2d '",
	   str1->xMin, str1->xMax, str1->yMin, str1->yMax,
	   (int)(str1->yMax - str1->yMin));
    for (i = 0; i < str1->len; ++i) {
      fputc(str1->text[i] & 0xff, stdout);
    }
    printf("'\n");
  }
  printf("\n------------------------------------------------------------\n\n");
#endif
    str1 = yxStrings;

    // nothing to do for an empty page
    if (!str1)
        return;

    //----- discard duplicated text (fake boldface, drop shadows)
    if (!complexMode) { /* if not in complex mode get rid of duplicate strings */
        HtmlString *str3;
        bool found;
        while (str1) {
            double size = str1->yMax - str1->yMin;
            // only successors starting within one string height of str1's left edge are examined
            double xLimit = str1->xMin + size;
            found = false;
            // str2 trails str3 by one node so that str3 can be unlinked below
            for (str2 = str1, str3 = str1->yxNext; str3 && str3->xMin < xLimit; str2 = str3, str3 = str2->yxNext) {
                if (str3->len == str1->len && !memcmp(str3->text, str1->text, str1->len * sizeof(Unicode)) && fabs(str3->yMin - str1->yMin) < size * 0.2 && fabs(str3->yMax - str1->yMax) < size * 0.2
                    && fabs(str3->xMax - str1->xMax) < size * 0.1) {
                    found = true;
                    // printf("found duplicate!\n");
                    break;
                }
            }
            if (found) {
                // unlink and free the duplicate; str1 is deliberately not
                // advanced, so it is checked again against its new successors
                str2->xyNext = str3->xyNext;
                str2->yxNext = str3->yxNext;
                delete str3;
            } else {
                str1 = str1->yxNext;
            }
        }
    } /*- !complexMode */

    str1 = yxStrings;

    // open the markup required by the first string's font and link
    const HtmlFont *hfont1 = getFont(str1);
    if (hfont1->isBold())
        str1->htext->insert(0, "<b>", 3);
    if (hfont1->isItalic())
        str1->htext->insert(0, "<i>", 3);
    if (str1->getLink() != nullptr) {
        GooString *ls = str1->getLink()->getLinkStart();
        str1->htext->insert(0, ls);
        delete ls;
    }
    // remember where the current (possibly growing) string started: a
    // line-break merge overwrites str1->yMin, and the saved values are
    // written back once the string is complete
    curX = str1->xMin;
    curY = str1->yMin;

    while (str1 && (str2 = str1->yxNext)) {
        const HtmlFont *hfont2 = getFont(str2);
        space = str1->yMax - str1->yMin; // the height of the font's bounding box
        horSpace = str2->xMin - str1->xMax;
        // if strings line up on left-hand side AND they are on subsequent lines, we need a line break
        addLineBreak = !noMerge && (fabs(str1->xMin - str2->xMin) < 0.4) && IS_CLOSER(str2->yMax, str1->yMax + space, str1->yMax);
        vertSpace = str2->yMin - str1->yMax;

        // printf("coalesce %d %d %f? ", str1->dir, str2->dir, d);

        // vertical overlap of the two boxes (0 when neither edge of str2
        // falls inside str1's vertical range)
        if (str2->yMin >= str1->yMin && str2->yMin <= str1->yMax) {
            vertOverlap = str1->yMax - str2->yMin;
        } else if (str2->yMax >= str1->yMin && str2->yMax <= str1->yMax) {
            vertOverlap = str2->yMax - str1->yMin;
        } else {
            vertOverlap = 0;
        }

        // Combine strings if:
        //  They appear to be the same font (complex mode only) && going in the same direction AND at least one of the following:
        //  1.  They appear to be part of the same line of text
        //  2.  They appear to be subsequent lines of a paragraph
        //  We assume (1) or (2) above, respectively, based on:
        //  (1)  strings overlap vertically AND
        //       horizontal space between end of str1 and start of str2 is consistent with a single space or less;
        //       when rawOrder, the strings have to overlap vertically by at least 50%
        //  (2)  Strings flow down the page, but the space between them is not too great, and they are lined up on the left
        if (((((rawOrder && vertOverlap > 0.5 * space) || (!rawOrder && str2->yMin < str1->yMax)) && (horSpace > -0.5 * space && horSpace < space)) || (vertSpace >= 0 && vertSpace < 0.5 * space && addLineBreak))
            && (!complexMode || (hfont1->isEqualIgnoreBold(*hfont2))) && // in complex mode fonts must be the same, in other modes fonts do not matter
            str1->dir == str2->dir // text direction the same
        ) {
            //      printf("yes\n");
            // n = number of code points after the merge, including the
            // optional separator space and line break
            n = str1->len + str2->len;
            if ((addSpace = horSpace > wordBreakThreshold * space)) {
                ++n;
            }
            if (addLineBreak) {
                ++n;
            }

            // round the capacity up to a multiple of 16 and grow both arrays
            str1->size = (n + 15) & ~15;
            str1->text = (Unicode *)grealloc(str1->text, str1->size * sizeof(Unicode));
            str1->xRight = (double *)grealloc(str1->xRight, str1->size * sizeof(double));
            if (addSpace) {
                str1->text[str1->len] = 0x20;
                str1->htext->append(xml ? " " : "&#160;");
                str1->xRight[str1->len] = str2->xMin;
                ++str1->len;
            }
            if (addLineBreak) {
                str1->text[str1->len] = '\n';
                str1->htext->append("<br/>");
                str1->xRight[str1->len] = str2->xMin;
                ++str1->len;
                // the merged string now takes over str2's box
                str1->yMin = str2->yMin;
                str1->yMax = str2->yMax;
                str1->xMax = str2->xMax;
                // if the observed line distance differs from the font's line
                // size, register a copy of the font with the observed value
                int fontLineSize = hfont1->getLineSize();
                int curLineSize = (int)(vertSpace + space);
                if (curLineSize != fontLineSize) {
                    HtmlFont *newfnt = new HtmlFont(*hfont1);
                    newfnt->setLineSize(curLineSize);
                    str1->fontpos = fonts->AddFont(*newfnt);
                    delete newfnt;
                    hfont1 = getFont(str1);
                    // we have to reget hfont2 because its location could have
                    // changed on resize
                    hfont2 = getFont(str2);
                }
            }
            // copy str2's code points and right edges behind str1's
            for (i = 0; i < str2->len; ++i) {
                str1->text[str1->len] = str2->text[i];
                str1->xRight[str1->len] = str2->xRight[i];
                ++str1->len;
            }

            /* fix <i>, <b> if str1 and str2 differ and handle switch of links */
            const HtmlLink *hlink1 = str1->getLink();
            const HtmlLink *hlink2 = str2->getLink();
            bool switch_links = !hlink1 || !hlink2 || !hlink1->isEqualDest(*hlink2);
            bool finish_a = switch_links && hlink1 != nullptr;
            bool finish_italic = hfont1->isItalic() && (!hfont2->isItalic() || finish_a);
            bool finish_bold = hfont1->isBold() && (!hfont2->isBold() || finish_a || finish_italic);
            CloseTags(str1->htext, finish_a, finish_italic, finish_bold);
            if (switch_links && hlink2 != nullptr) {
                GooString *ls = hlink2->getLinkStart();
                str1->htext->append(ls);
                delete ls;
            }
            // reopen italic/bold for str2's part where it was not open or was just closed
            if ((!hfont1->isItalic() || finish_italic) && hfont2->isItalic())
                str1->htext->append("<i>", 3);
            if ((!hfont1->isBold() || finish_bold) && hfont2->isBold())
                str1->htext->append("<b>", 3);

            str1->htext->append(str2->htext);
            // str1 now contains href for link of str2 (if it is defined)
            str1->link = str2->link;
            hfont1 = hfont2;
            if (str2->xMax > str1->xMax) {
                str1->xMax = str2->xMax;
            }
            if (str2->yMax > str1->yMax) {
                str1->yMax = str2->yMax;
            }
            // unlink and free str2; str1 stays current and is compared with
            // its new successor on the next iteration
            str1->yxNext = str2->yxNext;
            delete str2;
        } else { // keep strings separate
            //      printf("no\n");
            bool finish_a = str1->getLink() != nullptr;
            bool finish_bold = hfont1->isBold();
            bool finish_italic = hfont1->isItalic();
            CloseTags(str1->htext, finish_a, finish_italic, finish_bold);

            // str1 is complete: write back its saved origin, then make str2
            // the current string and open its markup
            str1->xMin = curX;
            str1->yMin = curY;
            str1 = str2;
            curX = str1->xMin;
            curY = str1->yMin;
            hfont1 = hfont2;
            if (hfont1->isBold())
                str1->htext->insert(0, "<b>", 3);
            if (hfont1->isItalic())
                str1->htext->insert(0, "<i>", 3);
            if (str1->getLink() != nullptr) {
                GooString *ls = str1->getLink()->getLinkStart();
                str1->htext->insert(0, ls);
                delete ls;
            }
        }
    }
    // finish the last string: write back its origin and close open tags
    str1->xMin = curX;
    str1->yMin = curY;

    bool finish_bold = hfont1->isBold();
    bool finish_italic = hfont1->isItalic();
    bool finish_a = str1->getLink() != nullptr;
    CloseTags(str1->htext, finish_a, finish_italic, finish_bold);

#if 0 //~ for debugging
  for (str1 = yxStrings; str1; str1 = str1->yxNext) {
    printf("x=%3d..%3d  y=%3d..%3d  size=%2d ",
	   (int)str1->xMin, (int)str1->xMax, (int)str1->yMin, (int)str1->yMax,
	   (int)(str1->yMax - str1->yMin));
    printf("'%s'\n", str1->htext->c_str());
  }
  printf("\n------------------------------------------------------------\n\n");
#endif
}
703 
dumpAsXML(FILE * f,int page)704 void HtmlPage::dumpAsXML(FILE *f, int page)
705 {
706     fprintf(f, "<page number=\"%d\" position=\"absolute\"", page);
707     fprintf(f, " top=\"0\" left=\"0\" height=\"%d\" width=\"%d\">\n", pageHeight, pageWidth);
708 
709     for (int i = fontsPageMarker; i < fonts->size(); i++) {
710         GooString *fontCSStyle = fonts->CSStyle(i);
711         fprintf(f, "\t%s\n", fontCSStyle->c_str());
712         delete fontCSStyle;
713     }
714 
715     for (auto ptr : imgList) {
716         auto img = static_cast<HtmlImage *>(ptr);
717         if (!noRoundedCoordinates) {
718             fprintf(f, "<image top=\"%d\" left=\"%d\" ", xoutRound(img->yMin), xoutRound(img->xMin));
719             fprintf(f, "width=\"%d\" height=\"%d\" ", xoutRound(img->xMax - img->xMin), xoutRound(img->yMax - img->yMin));
720         } else {
721             fprintf(f, "<image top=\"%f\" left=\"%f\" ", img->yMin, img->xMin);
722             fprintf(f, "width=\"%f\" height=\"%f\" ", img->xMax - img->xMin, img->yMax - img->yMin);
723         }
724         fprintf(f, "src=\"%s\"/>\n", img->fName->c_str());
725         delete img;
726     }
727     imgList.clear();
728 
729     for (HtmlString *tmp = yxStrings; tmp; tmp = tmp->yxNext) {
730         if (tmp->htext) {
731             if (!noRoundedCoordinates) {
732                 fprintf(f, "<text top=\"%d\" left=\"%d\" ", xoutRound(tmp->yMin), xoutRound(tmp->xMin));
733                 fprintf(f, "width=\"%d\" height=\"%d\" ", xoutRound(tmp->xMax - tmp->xMin), xoutRound(tmp->yMax - tmp->yMin));
734             } else {
735                 fprintf(f, "<text top=\"%f\" left=\"%f\" ", tmp->yMin, tmp->xMin);
736                 fprintf(f, "width=\"%f\" height=\"%f\" ", tmp->xMax - tmp->xMin, tmp->yMax - tmp->yMin);
737             }
738             fprintf(f, "font=\"%d\">", tmp->fontpos);
739             fputs(tmp->htext->c_str(), f);
740             fputs("</text>\n", f);
741         }
742     }
743     fputs("</page>\n", f);
744 }
745 
// Writes a <style> element to f that defines the CSS classes .xflip, .yflip
// and .xyflip, which mirror an element horizontally, vertically, or both.
static void printCSS(FILE *f)
{
    // Image flip/flop CSS
    // Source:
    // http://stackoverflow.com/questions/1309055/cross-browser-way-to-flip-html-image-via-javascript-css
    // tested in Chrome, Fx (Linux) and IE9 (W7)
    static const char css[] = "<style type=\"text/css\">\n"
                              "<!--\n"
                              ".xflip {\n"
                              "    -moz-transform: scaleX(-1);\n"
                              "    -webkit-transform: scaleX(-1);\n"
                              "    -o-transform: scaleX(-1);\n"
                              "    transform: scaleX(-1);\n"
                              "    filter: fliph;\n"
                              "}\n"
                              ".yflip {\n"
                              "    -moz-transform: scaleY(-1);\n"
                              "    -webkit-transform: scaleY(-1);\n"
                              "    -o-transform: scaleY(-1);\n"
                              "    transform: scaleY(-1);\n"
                              "    filter: flipv;\n"
                              "}\n"
                              ".xyflip {\n"
                              "    -moz-transform: scaleX(-1) scaleY(-1);\n"
                              "    -webkit-transform: scaleX(-1) scaleY(-1);\n"
                              "    -o-transform: scaleX(-1) scaleY(-1);\n"
                              "    transform: scaleX(-1) scaleY(-1);\n"
                              "    filter: fliph + flipv;\n"
                              "}\n"
                              "-->\n"
                              "</style>\n";

    // everything except the terminating NUL
    const size_t cssLength = sizeof(css) - 1;
    fwrite(css, 1, cssLength, f);
}
805 
dumpComplexHeaders(FILE * const file,FILE * & pageFile,int page)806 int HtmlPage::dumpComplexHeaders(FILE *const file, FILE *&pageFile, int page)
807 {
808 
809     if (!noframes) {
810         const std::string pgNum = std::to_string(page);
811         std::string pageFileName(DocName->toStr());
812         if (!singleHtml) {
813             pageFileName += '-' + pgNum + ".html";
814             pageFile = fopen(pageFileName.c_str(), "w");
815         } else {
816             pageFileName += "-html.html";
817             pageFile = fopen(pageFileName.c_str(), "a");
818         }
819 
820         if (!pageFile) {
821             error(errIO, -1, "Couldn't open html file '{0:s}'", pageFileName.c_str());
822             return 1;
823         }
824 
825         if (!singleHtml)
826             fprintf(pageFile, "%s\n<html xmlns=\"http://www.w3.org/1999/xhtml\" lang=\"\" xml:lang=\"\">\n<head>\n<title>Page %d</title>\n\n", DOCTYPE, page);
827         else
828             fprintf(pageFile, "%s\n<html xmlns=\"http://www.w3.org/1999/xhtml\" lang=\"\" xml:lang=\"\">\n<head>\n<title>%s</title>\n\n", DOCTYPE, pageFileName.c_str());
829 
830         const std::string htmlEncoding = HtmlOutputDev::mapEncodingToHtml(globalParams->getTextEncodingName());
831         if (!singleHtml)
832             fprintf(pageFile, "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=%s\"/>\n", htmlEncoding.c_str());
833         else
834             fprintf(pageFile, "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=%s\"/>\n <br/>\n", htmlEncoding.c_str());
835     } else {
836         pageFile = file;
837         fprintf(pageFile, "<!-- Page %d -->\n", page);
838         fprintf(pageFile, "<a name=\"%d\"></a>\n", page);
839     }
840 
841     return 0;
842 }
843 
dumpComplex(FILE * file,int page,const std::vector<std::string> & backgroundImages)844 void HtmlPage::dumpComplex(FILE *file, int page, const std::vector<std::string> &backgroundImages)
845 {
846     FILE *pageFile;
847 
848     if (firstPage == -1)
849         firstPage = page;
850 
851     if (dumpComplexHeaders(file, pageFile, page)) {
852         error(errIO, -1, "Couldn't write headers.");
853         return;
854     }
855 
856     fputs("<style type=\"text/css\">\n<!--\n", pageFile);
857     fputs("\tp {margin: 0; padding: 0;}", pageFile);
858     for (int i = fontsPageMarker; i != fonts->size(); i++) {
859         GooString *fontCSStyle;
860         if (!singleHtml)
861             fontCSStyle = fonts->CSStyle(i);
862         else
863             fontCSStyle = fonts->CSStyle(i, page);
864         fprintf(pageFile, "\t%s\n", fontCSStyle->c_str());
865         delete fontCSStyle;
866     }
867 
868     fputs("-->\n</style>\n", pageFile);
869 
870     if (!noframes) {
871         fputs("</head>\n<body bgcolor=\"#A0A0A0\" vlink=\"blue\" link=\"blue\">\n", pageFile);
872     }
873 
874     fprintf(pageFile, "<div id=\"page%d-div\" style=\"position:relative;width:%dpx;height:%dpx;\">\n", page, pageWidth, pageHeight);
875 
876     if (!ignore && (size_t)(page - firstPage) < backgroundImages.size()) {
877         fprintf(pageFile, "<img width=\"%d\" height=\"%d\" src=\"%s\" alt=\"background image\"/>\n", pageWidth, pageHeight, backgroundImages[page - firstPage].c_str());
878     }
879 
880     for (HtmlString *tmp1 = yxStrings; tmp1; tmp1 = tmp1->yxNext) {
881         if (tmp1->htext) {
882             fprintf(pageFile, "<p style=\"position:absolute;top:%dpx;left:%dpx;white-space:nowrap\" class=\"ft", xoutRound(tmp1->yMin), xoutRound(tmp1->xMin));
883             if (!singleHtml) {
884                 fputc('0', pageFile);
885             } else {
886                 fprintf(pageFile, "%d", page);
887             }
888             fprintf(pageFile, "%d\">", tmp1->fontpos);
889             fputs(tmp1->htext->c_str(), pageFile);
890             fputs("</p>\n", pageFile);
891         }
892     }
893 
894     fputs("</div>\n", pageFile);
895 
896     if (!noframes) {
897         fputs("</body>\n</html>\n", pageFile);
898         fclose(pageFile);
899     }
900 }
901 
dump(FILE * f,int pageNum,const std::vector<std::string> & backgroundImages)902 void HtmlPage::dump(FILE *f, int pageNum, const std::vector<std::string> &backgroundImages)
903 {
904     if (complexMode || singleHtml) {
905         if (xml)
906             dumpAsXML(f, pageNum);
907         if (!xml)
908             dumpComplex(f, pageNum, backgroundImages);
909     } else {
910         fprintf(f, "<a name=%d></a>", pageNum);
911         // Loop over the list of image names on this page
912         for (auto ptr : imgList) {
913             auto img = static_cast<HtmlImage *>(ptr);
914 
915             // see printCSS() for class names
916             const char *styles[4] = { "", " class=\"xflip\"", " class=\"yflip\"", " class=\"xyflip\"" };
917             int style_index = 0;
918             if (img->xMin > img->xMax)
919                 style_index += 1; // xFlip
920             if (img->yMin > img->yMax)
921                 style_index += 2; // yFlip
922 
923             fprintf(f, "<img%s src=\"%s\"/><br/>\n", styles[style_index], img->fName->c_str());
924             delete img;
925         }
926         imgList.clear();
927 
928         GooString *str;
929         for (HtmlString *tmp = yxStrings; tmp; tmp = tmp->yxNext) {
930             if (tmp->htext) {
931                 str = new GooString(tmp->htext);
932                 fputs(str->c_str(), f);
933                 delete str;
934                 fputs("<br/>\n", f);
935             }
936         }
937         fputs("<hr/>\n", f);
938     }
939 }
940 
clear()941 void HtmlPage::clear()
942 {
943     HtmlString *p1, *p2;
944 
945     if (curStr) {
946         delete curStr;
947         curStr = nullptr;
948     }
949     for (p1 = yxStrings; p1; p1 = p2) {
950         p2 = p1->yxNext;
951         delete p1;
952     }
953     yxStrings = nullptr;
954     xyStrings = nullptr;
955     yxCur1 = yxCur2 = nullptr;
956 
957     if (!noframes) {
958         delete fonts;
959         fonts = new HtmlFontAccu();
960         fontsPageMarker = 0;
961     } else {
962         fontsPageMarker = fonts->size();
963     }
964 
965     delete links;
966     links = new HtmlLinks();
967 }
968 
setDocName(const char * fname)969 void HtmlPage::setDocName(const char *fname)
970 {
971     DocName = new GooString(fname);
972 }
973 
addImage(GooString * fname,GfxState * state)974 void HtmlPage::addImage(GooString *fname, GfxState *state)
975 {
976     HtmlImage *img = new HtmlImage(fname, state);
977     imgList.push_back(img);
978 }
979 
980 //------------------------------------------------------------------------
981 // HtmlMetaVar
982 //------------------------------------------------------------------------
983 
HtmlMetaVar(const char * _name,const char * _content)984 HtmlMetaVar::HtmlMetaVar(const char *_name, const char *_content)
985 {
986     name = new GooString(_name);
987     content = new GooString(_content);
988 }
989 
~HtmlMetaVar()990 HtmlMetaVar::~HtmlMetaVar()
991 {
992     delete name;
993     delete content;
994 }
995 
toString() const996 GooString *HtmlMetaVar::toString() const
997 {
998     GooString *result = new GooString("<meta name=\"");
999     result->append(name);
1000     result->append("\" content=\"");
1001     result->append(content);
1002     result->append("\"/>");
1003     return result;
1004 }
1005 
1006 //------------------------------------------------------------------------
1007 // HtmlOutputDev
1008 //------------------------------------------------------------------------
1009 
1010 static const char *HtmlEncodings[][2] = { { "Latin1", "ISO-8859-1" }, { nullptr, nullptr } };
1011 
mapEncodingToHtml(const std::string & encoding)1012 std::string HtmlOutputDev::mapEncodingToHtml(const std::string &encoding)
1013 {
1014     for (int i = 0; HtmlEncodings[i][0] != nullptr; i++) {
1015         if (encoding == HtmlEncodings[i][0]) {
1016             return HtmlEncodings[i][1];
1017         }
1018     }
1019     return encoding;
1020 }
1021 
doFrame(int firstPage)1022 void HtmlOutputDev::doFrame(int firstPage)
1023 {
1024     GooString *fName = new GooString(Docname);
1025     fName->append(".html");
1026 
1027     if (!(fContentsFrame = fopen(fName->c_str(), "w"))) {
1028         error(errIO, -1, "Couldn't open html file '{0:t}'", fName);
1029         delete fName;
1030         return;
1031     }
1032 
1033     delete fName;
1034 
1035     const std::string baseName = gbasename(Docname->c_str());
1036     fputs(DOCTYPE, fContentsFrame);
1037     fputs("\n<html>", fContentsFrame);
1038     fputs("\n<head>", fContentsFrame);
1039     fprintf(fContentsFrame, "\n<title>%s</title>", docTitle->c_str());
1040     const std::string htmlEncoding = mapEncodingToHtml(globalParams->getTextEncodingName());
1041     fprintf(fContentsFrame, "\n<meta http-equiv=\"Content-Type\" content=\"text/html; charset=%s\"/>\n", htmlEncoding.c_str());
1042     dumpMetaVars(fContentsFrame);
1043     fprintf(fContentsFrame, "</head>\n");
1044     fputs("<frameset cols=\"100,*\">\n", fContentsFrame);
1045     fprintf(fContentsFrame, "<frame name=\"links\" src=\"%s_ind.html\"/>\n", baseName.c_str());
1046     fputs("<frame name=\"contents\" src=", fContentsFrame);
1047     if (complexMode)
1048         fprintf(fContentsFrame, "\"%s-%d.html\"", baseName.c_str(), firstPage);
1049     else
1050         fprintf(fContentsFrame, "\"%ss.html\"", baseName.c_str());
1051 
1052     fputs("/>\n</frameset>\n</html>\n", fContentsFrame);
1053 
1054     fclose(fContentsFrame);
1055 }
1056 
HtmlOutputDev(Catalog * catalogA,const char * fileName,const char * title,const char * author,const char * keywords,const char * subject,const char * date,bool rawOrderA,int firstPage,bool outline)1057 HtmlOutputDev::HtmlOutputDev(Catalog *catalogA, const char *fileName, const char *title, const char *author, const char *keywords, const char *subject, const char *date, bool rawOrderA, int firstPage, bool outline)
1058 {
1059     catalog = catalogA;
1060     fContentsFrame = nullptr;
1061     page = nullptr;
1062     docTitle = new GooString(title);
1063     pages = nullptr;
1064     dumpJPEG = true;
1065     // write = true;
1066     rawOrder = rawOrderA;
1067     this->doOutline = outline;
1068     ok = false;
1069     // this->firstPage = firstPage;
1070     // pageNum=firstPage;
1071     // open file
1072     needClose = false;
1073     pages = new HtmlPage(rawOrder);
1074 
1075     glMetaVars.push_back(new HtmlMetaVar("generator", "pdftohtml 0.36"));
1076     if (author)
1077         glMetaVars.push_back(new HtmlMetaVar("author", author));
1078     if (keywords)
1079         glMetaVars.push_back(new HtmlMetaVar("keywords", keywords));
1080     if (date)
1081         glMetaVars.push_back(new HtmlMetaVar("date", date));
1082     if (subject)
1083         glMetaVars.push_back(new HtmlMetaVar("subject", subject));
1084 
1085     maxPageWidth = 0;
1086     maxPageHeight = 0;
1087 
1088     pages->setDocName(fileName);
1089     Docname = new GooString(fileName);
1090 
1091     // for non-xml output (complex or simple) with frames generate the left frame
1092     if (!xml && !noframes) {
1093         if (!singleHtml) {
1094             GooString *left = new GooString(fileName);
1095             left->append("_ind.html");
1096 
1097             doFrame(firstPage);
1098 
1099             if (!(fContentsFrame = fopen(left->c_str(), "w"))) {
1100                 error(errIO, -1, "Couldn't open html file '{0:t}'", left);
1101                 delete left;
1102                 return;
1103             }
1104             delete left;
1105             fputs(DOCTYPE, fContentsFrame);
1106             fputs("<html xmlns=\"http://www.w3.org/1999/xhtml\" lang=\"\" xml:lang=\"\">\n<head>\n<title></title>\n</head>\n<body>\n", fContentsFrame);
1107 
1108             if (doOutline) {
1109                 fprintf(fContentsFrame, "<a href=\"%s%s\" target=\"contents\">Outline</a><br/>", gbasename(Docname->c_str()).c_str(), complexMode ? "-outline.html" : "s.html#outline");
1110             }
1111         }
1112         if (!complexMode) { /* not in complex mode */
1113 
1114             GooString *right = new GooString(fileName);
1115             right->append("s.html");
1116 
1117             if (!(page = fopen(right->c_str(), "w"))) {
1118                 error(errIO, -1, "Couldn't open html file '{0:t}'", right);
1119                 delete right;
1120                 return;
1121             }
1122             delete right;
1123             fputs(DOCTYPE, page);
1124             fputs("<html>\n<head>\n<title></title>\n", page);
1125             printCSS(page);
1126             fputs("</head>\n<body>\n", page);
1127         }
1128     }
1129 
1130     if (noframes) {
1131         if (stout)
1132             page = stdout;
1133         else {
1134             GooString *right = new GooString(fileName);
1135             if (!xml)
1136                 right->append(".html");
1137             if (xml)
1138                 right->append(".xml");
1139             if (!(page = fopen(right->c_str(), "w"))) {
1140                 error(errIO, -1, "Couldn't open html file '{0:t}'", right);
1141                 delete right;
1142                 return;
1143             }
1144             delete right;
1145         }
1146 
1147         const std::string htmlEncoding = mapEncodingToHtml(globalParams->getTextEncodingName());
1148         if (xml) {
1149             fprintf(page, "<?xml version=\"1.0\" encoding=\"%s\"?>\n", htmlEncoding.c_str());
1150             fputs("<!DOCTYPE pdf2xml SYSTEM \"pdf2xml.dtd\">\n\n", page);
1151             fprintf(page, "<pdf2xml producer=\"%s\" version=\"%s\">\n", PACKAGE_NAME, PACKAGE_VERSION);
1152         } else {
1153             fprintf(page, "%s\n<html xmlns=\"http://www.w3.org/1999/xhtml\" lang=\"\" xml:lang=\"\">\n<head>\n<title>%s</title>\n", DOCTYPE, docTitle->c_str());
1154 
1155             fprintf(page, "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=%s\"/>\n", htmlEncoding.c_str());
1156 
1157             dumpMetaVars(page);
1158             printCSS(page);
1159             fprintf(page, "</head>\n");
1160             fprintf(page, "<body bgcolor=\"#A0A0A0\" vlink=\"blue\" link=\"blue\">\n");
1161         }
1162     }
1163     ok = true;
1164 }
1165 
~HtmlOutputDev()1166 HtmlOutputDev::~HtmlOutputDev()
1167 {
1168     delete Docname;
1169     delete docTitle;
1170 
1171     for (auto entry : glMetaVars) {
1172         delete entry;
1173     }
1174 
1175     if (fContentsFrame) {
1176         fputs("</body>\n</html>\n", fContentsFrame);
1177         fclose(fContentsFrame);
1178     }
1179     if (page != nullptr) {
1180         if (xml) {
1181             fputs("</pdf2xml>\n", page);
1182             fclose(page);
1183         } else if (!complexMode || xml || noframes) {
1184             fputs("</body>\n</html>\n", page);
1185             fclose(page);
1186         }
1187     }
1188     if (pages)
1189         delete pages;
1190 }
1191 
startPage(int pageNumA,GfxState * state,XRef * xref)1192 void HtmlOutputDev::startPage(int pageNumA, GfxState *state, XRef *xref)
1193 {
1194 #if 0
1195   if (mode&&!xml){
1196     if (write){
1197       write=false;
1198       GooString* fname=Dirname(Docname);
1199       fname->append("image.log");
1200       if((tin=fopen(getFileNameFromPath(fname->c_str(),fname->getLength()),"w"))==NULL){
1201 	printf("Error : can not open %s",fname);
1202 	exit(1);
1203       }
1204       delete fname;
1205     // if(state->getRotation()!=0)
1206     //  fprintf(tin,"ROTATE=%d rotate %d neg %d neg translate\n",state->getRotation(),state->getX1(),-state->getY1());
1207     // else
1208       fprintf(tin,"ROTATE=%d neg %d neg translate\n",state->getX1(),state->getY1());
1209     }
1210   }
1211 #endif
1212 
1213     pageNum = pageNumA;
1214     const std::string str = gbasename(Docname->c_str());
1215     pages->clear();
1216     if (!noframes) {
1217         if (fContentsFrame) {
1218             if (complexMode)
1219                 fprintf(fContentsFrame, "<a href=\"%s-%d.html\"", str.c_str(), pageNum);
1220             else
1221                 fprintf(fContentsFrame, "<a href=\"%ss.html#%d\"", str.c_str(), pageNum);
1222             fprintf(fContentsFrame, " target=\"contents\" >Page %d</a><br/>\n", pageNum);
1223         }
1224     }
1225 
1226     pages->pageWidth = static_cast<int>(state->getPageWidth());
1227     pages->pageHeight = static_cast<int>(state->getPageHeight());
1228 }
1229 
endPage()1230 void HtmlOutputDev::endPage()
1231 {
1232     std::unique_ptr<Links> linksList = docPage->getLinks();
1233     for (int i = 0; i < linksList->getNumLinks(); ++i) {
1234         doProcessLink(linksList->getLink(i));
1235     }
1236 
1237     pages->conv();
1238     pages->coalesce();
1239     pages->dump(page, pageNum, backgroundImages);
1240 
1241     // I don't yet know what to do in the case when there are pages of different
1242     // sizes and we want complex output: running ghostscript many times
1243     // seems very inefficient. So for now I'll just use last page's size
1244     maxPageWidth = pages->pageWidth;
1245     maxPageHeight = pages->pageHeight;
1246 
1247     // if(!noframes&&!xml) fputs("<br/>\n", fContentsFrame);
1248     if (!stout && !globalParams->getErrQuiet())
1249         printf("Page-%d\n", (pageNum));
1250 }
1251 
addBackgroundImage(const std::string & img)1252 void HtmlOutputDev::addBackgroundImage(const std::string &img)
1253 {
1254     backgroundImages.push_back(img);
1255 }
1256 
updateFont(GfxState * state)1257 void HtmlOutputDev::updateFont(GfxState *state)
1258 {
1259     pages->updateFont(state);
1260 }
1261 
beginString(GfxState * state,const GooString * s)1262 void HtmlOutputDev::beginString(GfxState *state, const GooString *s)
1263 {
1264     pages->beginString(state, s);
1265 }
1266 
endString(GfxState * state)1267 void HtmlOutputDev::endString(GfxState *state)
1268 {
1269     pages->endString();
1270 }
1271 
drawChar(GfxState * state,double x,double y,double dx,double dy,double originX,double originY,CharCode code,int,const Unicode * u,int uLen)1272 void HtmlOutputDev::drawChar(GfxState *state, double x, double y, double dx, double dy, double originX, double originY, CharCode code, int /*nBytes*/, const Unicode *u, int uLen)
1273 {
1274     if (!showHidden && (state->getRender() & 3) == 3) {
1275         return;
1276     }
1277     pages->addChar(state, x, y, dx, dy, originX, originY, u, uLen);
1278 }
1279 
drawJpegImage(GfxState * state,Stream * str)1280 void HtmlOutputDev::drawJpegImage(GfxState *state, Stream *str)
1281 {
1282     InMemoryFile ims;
1283     FILE *f1 = nullptr;
1284     int c;
1285 
1286     // open the image file
1287     GooString *fName = createImageFileName("jpg");
1288     f1 = dataUrls ? ims.open("wb") : fopen(fName->c_str(), "wb");
1289     if (!f1) {
1290         error(errIO, -1, "Couldn't open image file '{0:t}'", fName);
1291         delete fName;
1292         return;
1293     }
1294 
1295     // initialize stream
1296     str = str->getNextStream();
1297     str->reset();
1298 
1299     // copy the stream
1300     while ((c = str->getChar()) != EOF)
1301         fputc(c, f1);
1302 
1303     fclose(f1);
1304 
1305     if (dataUrls) {
1306         delete fName;
1307         fName = new GooString(std::string("data:image/jpeg;base64,") + gbase64Encode(ims.getBuffer()));
1308     }
1309     pages->addImage(fName, state);
1310 }
1311 
drawPngImage(GfxState * state,Stream * str,int width,int height,GfxImageColorMap * colorMap,bool isMask)1312 void HtmlOutputDev::drawPngImage(GfxState *state, Stream *str, int width, int height, GfxImageColorMap *colorMap, bool isMask)
1313 {
1314 #ifdef ENABLE_LIBPNG
1315     FILE *f1;
1316     InMemoryFile ims;
1317 
1318     if (!colorMap && !isMask) {
1319         error(errInternal, -1, "Can't have color image without a color map");
1320         return;
1321     }
1322 
1323     // open the image file
1324     GooString *fName = createImageFileName("png");
1325     f1 = dataUrls ? ims.open("wb") : fopen(fName->c_str(), "wb");
1326     if (!f1) {
1327         error(errIO, -1, "Couldn't open image file '{0:t}'", fName);
1328         delete fName;
1329         return;
1330     }
1331 
1332     PNGWriter *writer = new PNGWriter(isMask ? PNGWriter::MONOCHROME : PNGWriter::RGB);
1333     // TODO can we calculate the resolution of the image?
1334     if (!writer->init(f1, width, height, 72, 72)) {
1335         error(errInternal, -1, "Can't init PNG for image '{0:t}'", fName);
1336         delete writer;
1337         fclose(f1);
1338         return;
1339     }
1340 
1341     if (!isMask) {
1342         unsigned char *p;
1343         GfxRGB rgb;
1344         png_byte *row = (png_byte *)gmalloc(3 * width); // 3 bytes/pixel: RGB
1345         png_bytep *row_pointer = &row;
1346 
1347         // Initialize the image stream
1348         ImageStream *imgStr = new ImageStream(str, width, colorMap->getNumPixelComps(), colorMap->getBits());
1349         imgStr->reset();
1350 
1351         // For each line...
1352         for (int y = 0; y < height; y++) {
1353 
1354             // Convert into a PNG row
1355             p = imgStr->getLine();
1356             if (!p) {
1357                 error(errIO, -1, "Failed to read PNG. '{0:t}' will be incorrect", fName);
1358                 delete fName;
1359                 gfree(row);
1360                 delete writer;
1361                 delete imgStr;
1362                 fclose(f1);
1363                 return;
1364             }
1365             for (int x = 0; x < width; x++) {
1366                 colorMap->getRGB(p, &rgb);
1367                 // Write the RGB pixels into the row
1368                 row[3 * x] = colToByte(rgb.r);
1369                 row[3 * x + 1] = colToByte(rgb.g);
1370                 row[3 * x + 2] = colToByte(rgb.b);
1371                 p += colorMap->getNumPixelComps();
1372             }
1373 
1374             if (!writer->writeRow(row_pointer)) {
1375                 error(errIO, -1, "Failed to write into PNG '{0:t}'", fName);
1376                 delete writer;
1377                 delete imgStr;
1378                 fclose(f1);
1379                 return;
1380             }
1381         }
1382         gfree(row);
1383         imgStr->close();
1384         delete imgStr;
1385     } else { // isMask == true
1386         int size = (width + 7) / 8;
1387 
1388         // PDF masks use 0 = draw current color, 1 = leave unchanged.
1389         // We invert this to provide the standard interpretation of alpha
1390         // (0 = transparent, 1 = opaque). If the colorMap already inverts
1391         // the mask we leave the data unchanged.
1392         int invert_bits = 0xff;
1393         if (colorMap) {
1394             GfxGray gray;
1395             unsigned char zero[gfxColorMaxComps];
1396             memset(zero, 0, sizeof(zero));
1397             colorMap->getGray(zero, &gray);
1398             if (colToByte(gray) == 0)
1399                 invert_bits = 0x00;
1400         }
1401 
1402         str->reset();
1403         unsigned char *png_row = (unsigned char *)gmalloc(size);
1404 
1405         for (int ri = 0; ri < height; ++ri) {
1406             for (int i = 0; i < size; i++)
1407                 png_row[i] = str->getChar() ^ invert_bits;
1408 
1409             if (!writer->writeRow(&png_row)) {
1410                 error(errIO, -1, "Failed to write into PNG '{0:t}'", fName);
1411                 delete writer;
1412                 fclose(f1);
1413                 gfree(png_row);
1414                 return;
1415             }
1416         }
1417         str->close();
1418         gfree(png_row);
1419     }
1420 
1421     str->close();
1422 
1423     writer->close();
1424     delete writer;
1425     fclose(f1);
1426 
1427     if (dataUrls) {
1428         delete fName;
1429         fName = new GooString(std::string("data:image/png;base64,") + gbase64Encode(ims.getBuffer()));
1430     }
1431     pages->addImage(fName, state);
1432 #else
1433     return;
1434 #endif
1435 }
1436 
createImageFileName(const char * ext)1437 GooString *HtmlOutputDev::createImageFileName(const char *ext)
1438 {
1439     return GooString::format("{0:s}-{1:d}_{2:d}.{3:s}", Docname->c_str(), pageNum, pages->getNumImages() + 1, ext);
1440 }
1441 
drawImageMask(GfxState * state,Object * ref,Stream * str,int width,int height,bool invert,bool interpolate,bool inlineImg)1442 void HtmlOutputDev::drawImageMask(GfxState *state, Object *ref, Stream *str, int width, int height, bool invert, bool interpolate, bool inlineImg)
1443 {
1444 
1445     if (ignore || (complexMode && !xml)) {
1446         OutputDev::drawImageMask(state, ref, str, width, height, invert, interpolate, inlineImg);
1447         return;
1448     }
1449 
1450     // dump JPEG file
1451     if (dumpJPEG && str->getKind() == strDCT) {
1452         drawJpegImage(state, str);
1453     } else {
1454 #ifdef ENABLE_LIBPNG
1455         drawPngImage(state, str, width, height, nullptr, true);
1456 #else
1457         OutputDev::drawImageMask(state, ref, str, width, height, invert, interpolate, inlineImg);
1458 #endif
1459     }
1460 }
1461 
drawImage(GfxState * state,Object * ref,Stream * str,int width,int height,GfxImageColorMap * colorMap,bool interpolate,const int * maskColors,bool inlineImg)1462 void HtmlOutputDev::drawImage(GfxState *state, Object *ref, Stream *str, int width, int height, GfxImageColorMap *colorMap, bool interpolate, const int *maskColors, bool inlineImg)
1463 {
1464 
1465     if (ignore || (complexMode && !xml)) {
1466         OutputDev::drawImage(state, ref, str, width, height, colorMap, interpolate, maskColors, inlineImg);
1467         return;
1468     }
1469 
1470     /*if( !globalParams->getErrQuiet() )
1471       printf("image stream of kind %d\n", str->getKind());*/
1472     // dump JPEG file
1473     if (dumpJPEG && str->getKind() == strDCT && (colorMap->getNumPixelComps() == 1 || colorMap->getNumPixelComps() == 3) && !inlineImg) {
1474         drawJpegImage(state, str);
1475     } else {
1476 #ifdef ENABLE_LIBPNG
1477         drawPngImage(state, str, width, height, colorMap);
1478 #else
1479         OutputDev::drawImage(state, ref, str, width, height, colorMap, interpolate, maskColors, inlineImg);
1480 #endif
1481     }
1482 }
1483 
doProcessLink(AnnotLink * link)1484 void HtmlOutputDev::doProcessLink(AnnotLink *link)
1485 {
1486     double _x1, _y1, _x2, _y2;
1487     int x1, y1, x2, y2;
1488 
1489     link->getRect(&_x1, &_y1, &_x2, &_y2);
1490     cvtUserToDev(_x1, _y1, &x1, &y1);
1491 
1492     cvtUserToDev(_x2, _y2, &x2, &y2);
1493 
1494     GooString *_dest = getLinkDest(link);
1495     HtmlLink t((double)x1, (double)y2, (double)x2, (double)y1, _dest);
1496     pages->AddLink(t);
1497     delete _dest;
1498 }
1499 
getLinkDest(AnnotLink * link)1500 GooString *HtmlOutputDev::getLinkDest(AnnotLink *link)
1501 {
1502     if (!link->getAction())
1503         return new GooString();
1504     switch (link->getAction()->getKind()) {
1505     case actionGoTo: {
1506         int destPage = 1;
1507         LinkGoTo *ha = (LinkGoTo *)link->getAction();
1508         std::unique_ptr<LinkDest> dest;
1509         if (ha->getDest() != nullptr)
1510             dest = std::unique_ptr<LinkDest>(ha->getDest()->copy());
1511         else if (ha->getNamedDest() != nullptr)
1512             dest = catalog->findDest(ha->getNamedDest());
1513 
1514         if (dest) {
1515             GooString *file = new GooString(gbasename(Docname->c_str()));
1516 
1517             if (dest->isPageRef()) {
1518                 const Ref pageref = dest->getPageRef();
1519                 destPage = catalog->findPage(pageref);
1520             } else {
1521                 destPage = dest->getPageNum();
1522             }
1523 
1524             /* 		complex 	simple
1525               frames		file-4.html	files.html#4
1526               noframes	file.html#4	file.html#4
1527              */
1528             if (noframes) {
1529                 file->append(".html#");
1530                 file->append(std::to_string(destPage));
1531             } else {
1532                 if (complexMode) {
1533                     file->append("-");
1534                     file->append(std::to_string(destPage));
1535                     file->append(".html");
1536                 } else {
1537                     file->append("s.html#");
1538                     file->append(std::to_string(destPage));
1539                 }
1540             }
1541 
1542             if (printCommands)
1543                 printf(" link to page %d ", destPage);
1544             return file;
1545         } else {
1546             return new GooString();
1547         }
1548     }
1549     case actionGoToR: {
1550         LinkGoToR *ha = (LinkGoToR *)link->getAction();
1551         LinkDest *dest = nullptr;
1552         int destPage = 1;
1553         GooString *file = new GooString();
1554         if (ha->getFileName()) {
1555             delete file;
1556             file = new GooString(ha->getFileName()->c_str());
1557         }
1558         if (ha->getDest() != nullptr)
1559             dest = ha->getDest()->copy();
1560         if (dest && file) {
1561             if (!(dest->isPageRef()))
1562                 destPage = dest->getPageNum();
1563             delete dest;
1564 
1565             if (printCommands)
1566                 printf(" link to page %d ", destPage);
1567             if (printHtml) {
1568                 const char *p = file->c_str() + file->getLength() - 4;
1569                 if (!strcmp(p, ".pdf") || !strcmp(p, ".PDF")) {
1570                     file->del(file->getLength() - 4, 4);
1571                     file->append(".html");
1572                 }
1573                 file->append('#');
1574                 file->append(std::to_string(destPage));
1575             }
1576         }
1577         if (printCommands && file)
1578             printf("filename %s\n", file->c_str());
1579         return file;
1580     }
1581     case actionURI: {
1582         LinkURI *ha = (LinkURI *)link->getAction();
1583         GooString *file = new GooString(ha->getURI());
1584         // printf("uri : %s\n",file->c_str());
1585         return file;
1586     }
1587     case actionLaunch:
1588         if (printHtml) {
1589             LinkLaunch *ha = (LinkLaunch *)link->getAction();
1590             GooString *file = new GooString(ha->getFileName()->c_str());
1591             const char *p = file->c_str() + file->getLength() - 4;
1592             if (!strcmp(p, ".pdf") || !strcmp(p, ".PDF")) {
1593                 file->del(file->getLength() - 4, 4);
1594                 file->append(".html");
1595             }
1596             if (printCommands)
1597                 printf("filename %s", file->c_str());
1598 
1599             return file;
1600         }
1601         // fallthrough
1602     default:
1603         return new GooString();
1604     }
1605 }
1606 
dumpMetaVars(FILE * file)1607 void HtmlOutputDev::dumpMetaVars(FILE *file)
1608 {
1609     GooString *var;
1610 
1611     for (const HtmlMetaVar *t : glMetaVars) {
1612         var = t->toString();
1613         fprintf(file, "%s\n", var->c_str());
1614         delete var;
1615     }
1616 }
1617 
dumpDocOutline(PDFDoc * doc)1618 bool HtmlOutputDev::dumpDocOutline(PDFDoc *doc)
1619 {
1620     FILE *output = nullptr;
1621     bool bClose = false;
1622 
1623     if (!ok)
1624         return false;
1625 
1626     Outline *outline = doc->getOutline();
1627     if (!outline)
1628         return false;
1629 
1630     const std::vector<OutlineItem *> *outlines = outline->getItems();
1631     if (!outlines)
1632         return false;
1633 
1634     if (!complexMode || xml) {
1635         output = page;
1636     } else if (complexMode && !xml) {
1637         if (noframes) {
1638             output = page;
1639             fputs("<hr/>\n", output);
1640         } else {
1641             GooString *str = Docname->copy();
1642             str->append("-outline.html");
1643             output = fopen(str->c_str(), "w");
1644             delete str;
1645             if (output == nullptr)
1646                 return false;
1647             bClose = true;
1648 
1649             const std::string htmlEncoding = HtmlOutputDev::mapEncodingToHtml(globalParams->getTextEncodingName());
1650 
1651             fprintf(output,
1652                     "<html xmlns=\"http://www.w3.org/1999/xhtml\" "
1653                     "lang=\"\" xml:lang=\"\">\n"
1654                     "<head>\n"
1655                     "<title>Document Outline</title>\n"
1656                     "<meta http-equiv=\"Content-Type\" content=\"text/html; "
1657                     "charset=%s\"/>\n"
1658                     "</head>\n<body>\n",
1659                     htmlEncoding.c_str());
1660         }
1661     }
1662 
1663     if (!xml) {
1664         bool done = newHtmlOutlineLevel(output, outlines);
1665         if (done && !complexMode)
1666             fputs("<hr/>\n", output);
1667 
1668         if (bClose) {
1669             fputs("</body>\n</html>\n", output);
1670             fclose(output);
1671         }
1672     } else
1673         newXmlOutlineLevel(output, outlines);
1674 
1675     return true;
1676 }
1677 
newHtmlOutlineLevel(FILE * output,const std::vector<OutlineItem * > * outlines,int level)1678 bool HtmlOutputDev::newHtmlOutlineLevel(FILE *output, const std::vector<OutlineItem *> *outlines, int level)
1679 {
1680     bool atLeastOne = false;
1681 
1682     if (level == 1) {
1683         fputs("<a name=\"outline\"></a>", output);
1684         fputs("<h1>Document Outline</h1>\n", output);
1685     }
1686     fputs("<ul>\n", output);
1687 
1688     for (OutlineItem *item : *outlines) {
1689         GooString *titleStr = HtmlFont::HtmlFilter(item->getTitle(), item->getTitleLength());
1690 
1691         GooString *linkName = nullptr;
1692         ;
1693         const int itemPage = getOutlinePageNum(item);
1694         if (itemPage > 0) {
1695             /*		complex		simple
1696             frames		file-4.html	files.html#4
1697             noframes	file.html#4	file.html#4
1698             */
1699             linkName = new GooString(gbasename(Docname->c_str()));
1700             if (noframes) {
1701                 linkName->append(".html#");
1702                 linkName->append(std::to_string(itemPage));
1703             } else {
1704                 if (complexMode) {
1705                     linkName->append("-");
1706                     linkName->append(std::to_string(itemPage));
1707                     linkName->append(".html");
1708                 } else {
1709                     linkName->append("s.html#");
1710                     linkName->append(std::to_string(itemPage));
1711                 }
1712             }
1713         }
1714 
1715         fputs("<li>", output);
1716         if (linkName)
1717             fprintf(output, "<a href=\"%s\">", linkName->c_str());
1718         fputs(titleStr->c_str(), output);
1719         if (linkName) {
1720             fputs("</a>", output);
1721             delete linkName;
1722         }
1723         delete titleStr;
1724         atLeastOne = true;
1725 
1726         item->open();
1727         if (item->hasKids() && item->getKids()) {
1728             fputs("\n", output);
1729             newHtmlOutlineLevel(output, item->getKids(), level + 1);
1730         }
1731         fputs("</li>\n", output);
1732     }
1733     fputs("</ul>\n", output);
1734 
1735     return atLeastOne;
1736 }
1737 
newXmlOutlineLevel(FILE * output,const std::vector<OutlineItem * > * outlines)1738 void HtmlOutputDev::newXmlOutlineLevel(FILE *output, const std::vector<OutlineItem *> *outlines)
1739 {
1740     fputs("<outline>\n", output);
1741 
1742     for (OutlineItem *item : *outlines) {
1743         GooString *titleStr = HtmlFont::HtmlFilter(item->getTitle(), item->getTitleLength());
1744         const int itemPage = getOutlinePageNum(item);
1745         if (itemPage > 0) {
1746             fprintf(output, "<item page=\"%d\">%s</item>\n", itemPage, titleStr->c_str());
1747         } else {
1748             fprintf(output, "<item>%s</item>\n", titleStr->c_str());
1749         }
1750         delete titleStr;
1751 
1752         item->open();
1753         if (item->hasKids() && item->getKids()) {
1754             newXmlOutlineLevel(output, item->getKids());
1755         }
1756     }
1757 
1758     fputs("</outline>\n", output);
1759 }
1760 
getOutlinePageNum(OutlineItem * item)1761 int HtmlOutputDev::getOutlinePageNum(OutlineItem *item)
1762 {
1763     const LinkAction *action = item->getAction();
1764     const LinkGoTo *link = nullptr;
1765     std::unique_ptr<LinkDest> linkdest;
1766     int pagenum = -1;
1767 
1768     if (!action || action->getKind() != actionGoTo)
1769         return pagenum;
1770 
1771     link = static_cast<const LinkGoTo *>(action);
1772 
1773     if (!link || !link->isOk())
1774         return pagenum;
1775 
1776     if (link->getDest())
1777         linkdest = std::unique_ptr<LinkDest>(link->getDest()->copy());
1778     else if (link->getNamedDest())
1779         linkdest = catalog->findDest(link->getNamedDest());
1780 
1781     if (!linkdest)
1782         return pagenum;
1783 
1784     if (linkdest->isPageRef()) {
1785         const Ref pageref = linkdest->getPageRef();
1786         pagenum = catalog->findPage(pageref);
1787     } else {
1788         pagenum = linkdest->getPageNum();
1789     }
1790 
1791     return pagenum;
1792 }
1793