1 //========================================================================
2 //
3 // HTMLGen.cc
4 //
5 // Copyright 2010 Glyph & Cog, LLC
6 //
7 //========================================================================
8
//~ to do:
//~ - fonts
//~   - underlined? (underlines are present in the background image)
//~   - include the original font name in the CSS entry (before the
//~     generic serif/sans-serif/monospace name)
//~ - check that htmlDir exists and is a directory
//~ - links:
//~   - links to pages
//~   - links to named destinations
//~   - links to URLs
//~ - rotated text should go in the background image
//~ - metadata
//~ - PDF outline
22
23 #include <aconf.h>
24
25 #ifdef USE_GCC_PRAGMAS
26 #pragma implementation
27 #endif
28
29 #include <stdlib.h>
30 #include <png.h>
31 #include "gmem.h"
32 #include "GString.h"
33 #include "GList.h"
34 #include "SplashBitmap.h"
35 #include "PDFDoc.h"
36 #include "TextOutputDev.h"
37 #include "SplashOutputDev.h"
38 #include "ErrorCodes.h"
39 #if EVAL_MODE
40 # include "SplashMath.h"
41 # include "Splash.h"
42 # include "BuiltinFontTables.h"
43 # include "FontEncodingTables.h"
44 #endif
45 #include "HTMLGen.h"
46
47 #ifdef _WIN32
48 # define strcasecmp stricmp
49 # define strncasecmp strnicmp
50 #endif
51
52 //------------------------------------------------------------------------
53
54 struct FontStyleTagInfo {
55 const char *tag;
56 int tagLen;
57 GBool bold;
58 GBool italic;
59 };
60
61 // NB: these are compared, in order, against the tail of the font
62 // name, so "BoldItalic" must come before "Italic", etc.
63 static FontStyleTagInfo fontStyleTags[] = {
64 {"Roman", 5, gFalse, gFalse},
65 {"Regular", 7, gFalse, gFalse},
66 {"Condensed", 9, gFalse, gFalse},
67 {"CondensedBold", 13, gTrue, gFalse},
68 {"CondensedLight", 14, gFalse, gFalse},
69 {"SemiBold", 8, gTrue, gFalse},
70 {"BoldItalic", 10, gTrue, gTrue},
71 {"Bold_Italic", 11, gTrue, gTrue},
72 {"BoldOblique", 11, gTrue, gTrue},
73 {"Bold_Oblique", 12, gTrue, gTrue},
74 {"Bold", 4, gTrue, gFalse},
75 {"Italic", 6, gFalse, gTrue},
76 {"Oblique", 7, gFalse, gTrue},
77 {NULL, 0, gFalse, gFalse}
78 };
79
80 struct StandardFontInfo {
81 const char *name;
82 GBool fixedWidth;
83 GBool serif;
84 };
85
86 static StandardFontInfo standardFonts[] = {
87 {"Arial", gFalse, gFalse},
88 {"Courier", gTrue, gFalse},
89 {"Futura", gFalse, gFalse},
90 {"Helvetica", gFalse, gFalse},
91 {"Minion", gFalse, gTrue},
92 {"NewCenturySchlbk", gFalse, gTrue},
93 {"Times", gFalse, gTrue},
94 {"TimesNew", gFalse, gTrue},
95 {"Times_New", gFalse, gTrue},
96 {"Verdana", gFalse, gFalse},
97 {"LucidaSans", gFalse, gFalse},
98 {NULL, gFalse, gFalse}
99 };
100
// Metrics for one of the generic substitute fonts.
struct SubstFontInfo {
  // reference width that a PDF font's getMWidth() value is compared
  // against (presumably the width of 'm' at unit font size -- confirm)
  double mWidth;
};
104
105 // index: {fixed:8, serif:4, sans-serif:0} + bold*2 + italic
106 static SubstFontInfo substFonts[16] = {
107 {0.833},
108 {0.833},
109 {0.889},
110 {0.889},
111 {0.788},
112 {0.722},
113 {0.833},
114 {0.778},
115 {0.600},
116 {0.600},
117 {0.600},
118 {0.600}
119 };
120
// Replacements for code points in the Unicode private use area,
// following the Adobe Glyph List.  The table is indexed by
// (code point - privateUnicodeMapStart); a zero entry means there is
// no replacement for that code point.
#define privateUnicodeMapStart 0xf6f9
#define privateUnicodeMapEnd   0xf7ff
static int
privateUnicodeMap[privateUnicodeMapEnd - privateUnicodeMapStart + 1] = {
  0x0141, 0x0152, 0,      0,      0x0160, 0,      0x017d,         // f6f9
  0,      0,      0,      0,      0,      0,      0,      0,      // f700
  0,      0,      0,      0,      0,      0,      0,      0,      // f708
  0,      0,      0,      0,      0,      0,      0,      0,      // f710
  0,      0,      0,      0,      0,      0,      0,      0,      // f718
  0,      0x0021, 0,      0,      0x0024, 0,      0x0026, 0,      // f720
  0,      0,      0,      0,      0,      0,      0,      0,      // f728
  0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, // f730
  0x0038, 0x0039, 0,      0,      0,      0,      0,      0x003f, // f738
  0,      0,      0,      0,      0,      0,      0,      0,      // f740
  0,      0,      0,      0,      0,      0,      0,      0,      // f748
  0,      0,      0,      0,      0,      0,      0,      0,      // f750
  0,      0,      0,      0,      0,      0,      0,      0,      // f758
  0,      0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, // f760
  0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, // f768
  0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, // f770
  0x0058, 0x0059, 0x005a, 0,      0,      0,      0,      0,      // f778
  0,      0,      0,      0,      0,      0,      0,      0,      // f780
  0,      0,      0,      0,      0,      0,      0,      0,      // f788
  0,      0,      0,      0,      0,      0,      0,      0,      // f790
  0,      0,      0,      0,      0,      0,      0,      0,      // f798
  0,      0x00a1, 0x00a2, 0,      0,      0,      0,      0,      // f7a0
  0,      0,      0,      0,      0,      0,      0,      0,      // f7a8
  0,      0,      0,      0,      0,      0,      0,      0,      // f7b0
  0,      0,      0,      0,      0,      0,      0,      0x00bf, // f7b8
  0,      0,      0,      0,      0,      0,      0,      0,      // f7c0
  0,      0,      0,      0,      0,      0,      0,      0,      // f7c8
  0,      0,      0,      0,      0,      0,      0,      0,      // f7d0
  0,      0,      0,      0,      0,      0,      0,      0,      // f7d8
  0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7, // f7e0
  0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, // f7e8
  0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0,      // f7f0
  0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x0178  // f7f8
};
161
162 //------------------------------------------------------------------------
163
164 #if EVAL_MODE
165
166 #define EVAL_MODE_MSG "XpdfHTML evaluation - www.glyphandcog.com"
167
drawEvalModeMsg(SplashOutputDev * out)168 static void drawEvalModeMsg(SplashOutputDev *out) {
169 BuiltinFont *bf;
170 SplashFont *font;
171 GString *fontName;
172 char *msg;
173 SplashCoord mat[4], ident[6];
174 SplashCoord diag, size, textW, x, y;
175 Gushort cw;
176 int w, h, n, i;
177
178 // get the Helvetica font info
179 bf = builtinFontSubst[4];
180
181 msg = EVAL_MODE_MSG;
182 n = strlen(msg);
183
184 w = out->getBitmap()->getWidth();
185 h = out->getBitmap()->getHeight();
186
187 ident[0] = 1; ident[1] = 0;
188 ident[2] = 0; ident[3] = -1;
189 ident[4] = 0; ident[5] = h;
190 out->getSplash()->setMatrix(ident);
191
192 diag = splashSqrt((SplashCoord)(w*w + h*h));
193 size = diag / (0.67 * n);
194 if (size < 8) {
195 size = 8;
196 }
197 mat[0] = size * (SplashCoord)w / diag;
198 mat[3] = mat[0];
199 mat[1] = size * (SplashCoord)h / diag;
200 mat[2] = -mat[1];
201 fontName = new GString(bf->name);
202 font = out->getFont(fontName, mat);
203 delete fontName;
204 if (!font) {
205 return;
206 }
207
208 textW = 0;
209 for (i = 0; i < n; ++i) {
210 bf->widths->getWidth(winAnsiEncoding[msg[i] & 0xff], &cw);
211 textW += size * cw * 0.001;
212 }
213
214 out->setFillColor(255, 0, 0);
215 x = 0.5 * (diag - textW) * (SplashCoord)w / diag;
216 y = 0.5 * (diag - textW) * (SplashCoord)h / diag;
217 for (i = 0; i < n; ++i) {
218 out->getSplash()->fillChar(x, y, msg[i], font);
219 bf->widths->getWidth(winAnsiEncoding[msg[i] & 0xff], &cw);
220 x += mat[0] * cw * 0.001;
221 y += mat[1] * cw * 0.001;
222 }
223 }
224 #endif
225
226 //------------------------------------------------------------------------
227
HTMLGen(double backgroundResolutionA)228 HTMLGen::HTMLGen(double backgroundResolutionA) {
229 TextOutputControl textOutControl;
230 SplashColor paperColor;
231
232 ok = gTrue;
233
234 backgroundResolution = backgroundResolutionA;
235 drawInvisibleText = gTrue;
236
237 // set up the TextOutputDev
238 textOutControl.mode = textOutReadingOrder;
239 textOutControl.html = gTrue;
240 textOut = new TextOutputDev(NULL, &textOutControl, gFalse);
241 if (!textOut->isOk()) {
242 ok = gFalse;
243 }
244
245 // set up the SplashOutputDev
246 paperColor[0] = paperColor[1] = paperColor[2] = 0xff;
247 splashOut = new SplashOutputDev(splashModeRGB8, 1, gFalse, paperColor);
248 splashOut->setSkipText(gTrue, gFalse);
249 }
250
~HTMLGen()251 HTMLGen::~HTMLGen() {
252 delete textOut;
253 delete splashOut;
254 }
255
startDoc(PDFDoc * docA)256 void HTMLGen::startDoc(PDFDoc *docA) {
257 doc = docA;
258 splashOut->startDoc(doc->getXRef());
259 }
260
// Send the NUL-terminated string [data] to [writeFunc], passing
// [stream] through unchanged.  Returns whatever [writeFunc] returns.
static inline int pr(int (*writeFunc)(void *stream, const char *data, int size),
                     void *stream, const char *data) {
  const int len = (int)strlen(data);

  return writeFunc(stream, data, len);
}
265
pf(int (* writeFunc)(void * stream,const char * data,int size),void * stream,const char * fmt,...)266 static int pf(int (*writeFunc)(void *stream, const char *data, int size),
267 void *stream, const char *fmt, ...) {
268 va_list args;
269 GString *s;
270 int ret;
271
272 va_start(args, fmt);
273 s = GString::formatv(fmt, args);
274 va_end(args);
275 ret = writeFunc(stream, s->getCString(), s->getLength());
276 delete s;
277 return ret;
278 }
279
// State handed to libpng (as its I/O pointer) so that pngWriteFunc can
// forward encoded PNG data to the caller-supplied writer.
struct PNGWriteInfo {
  // caller's output function
  int (*writePNG)(void *stream, const char *data, int size);
  // opaque stream handle passed back to [writePNG]
  void *pngStream;
};
284
pngWriteFunc(png_structp png,png_bytep data,png_size_t size)285 static void pngWriteFunc(png_structp png, png_bytep data, png_size_t size) {
286 PNGWriteInfo *info;
287
288 info = (PNGWriteInfo *)png_get_progressive_ptr(png);
289 info->writePNG(info->pngStream, (char *)data, (int)size);
290 }
291
convertPage(int pg,const char * pngURL,int (* writeHTML)(void * stream,const char * data,int size),void * htmlStream,int (* writePNG)(void * stream,const char * data,int size),void * pngStream)292 int HTMLGen::convertPage(
293 int pg, const char *pngURL,
294 int (*writeHTML)(void *stream, const char *data, int size),
295 void *htmlStream,
296 int (*writePNG)(void *stream, const char *data, int size),
297 void *pngStream) {
298 png_structp png;
299 png_infop pngInfo;
300 PNGWriteInfo writeInfo;
301 SplashBitmap *bitmap;
302 Guchar *p;
303 double pageW, pageH;
304 TextPage *text;
305 GList *fonts, *cols, *pars, *lines, *words;
306 double *fontScales;
307 TextFontInfo *font;
308 TextColumn *col;
309 TextParagraph *par;
310 TextLine *line;
311 TextWord *word0, *word1;
312 GString *s;
313 double base, base1;
314 int subSuper0, subSuper1;
315 double r0, g0, b0, r1, g1, b1;
316 int colIdx, parIdx, lineIdx, wordIdx;
317 int y, i, u;
318
319 // generate the background bitmap
320 doc->displayPage(splashOut, pg, backgroundResolution, backgroundResolution,
321 0, gFalse, gTrue, gFalse);
322 #if EVAL_MODE
323 drawEvalModeMsg(splashOut);
324 #endif
325 bitmap = splashOut->getBitmap();
326 if (!(png = png_create_write_struct(PNG_LIBPNG_VER_STRING,
327 NULL, NULL, NULL)) ||
328 !(pngInfo = png_create_info_struct(png))) {
329 return errFileIO;
330 }
331 if (setjmp(png_jmpbuf(png))) {
332 return errFileIO;
333 }
334 writeInfo.writePNG = writePNG;
335 writeInfo.pngStream = pngStream;
336 png_set_write_fn(png, &writeInfo, pngWriteFunc, NULL);
337 png_set_IHDR(png, pngInfo, bitmap->getWidth(), bitmap->getHeight(),
338 8, PNG_COLOR_TYPE_RGB, PNG_INTERLACE_NONE,
339 PNG_COMPRESSION_TYPE_DEFAULT, PNG_FILTER_TYPE_DEFAULT);
340 png_write_info(png, pngInfo);
341 p = bitmap->getDataPtr();
342 for (y = 0; y < bitmap->getHeight(); ++y) {
343 png_write_row(png, (png_bytep)p);
344 p += bitmap->getRowSize();
345 }
346 png_write_end(png, pngInfo);
347 png_destroy_write_struct(&png, &pngInfo);
348
349 // page size
350 pageW = doc->getPageCropWidth(pg);
351 pageH = doc->getPageCropHeight(pg);
352
353 // get the PDF text
354 doc->displayPage(textOut, pg, 72, 72, 0, gFalse, gTrue, gFalse);
355 doc->processLinks(textOut, pg);
356 text = textOut->takeText();
357
358 // HTML header
359 pr(writeHTML, htmlStream, "<html>\n");
360 pr(writeHTML, htmlStream, "<head>\n");
361 pr(writeHTML, htmlStream, "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">\n");
362 pr(writeHTML, htmlStream, "<style type=\"text/css\">\n");
363 pr(writeHTML, htmlStream, ".txt { white-space:nowrap; }\n");
364 fonts = text->getFonts();
365 fontScales = (double *)gmallocn(fonts->getLength(), sizeof(double));
366 for (i = 0; i < fonts->getLength(); ++i) {
367 font = (TextFontInfo *)fonts->get(i);
368 s = getFontDefn(font, &fontScales[i]);
369 pf(writeHTML, htmlStream, "#f{0:d} {{ {1:t} }}\n", i, s);
370 delete s;
371 }
372 pr(writeHTML, htmlStream, "</style>\n");
373 pr(writeHTML, htmlStream, "</head>\n");
374 pr(writeHTML, htmlStream, "<body onload=\"start()\">\n");
375 pf(writeHTML, htmlStream, "<img id=\"background\" style=\"position:absolute; left:0px; top:0px;\" width=\"{0:d}\" height=\"{1:d}\" src=\"{2:s}\">\n",
376 (int)pageW, (int)pageH, pngURL);
377
378 // generate the HTML text
379 cols = text->makeColumns();
380 for (colIdx = 0; colIdx < cols->getLength(); ++colIdx) {
381 col = (TextColumn *)cols->get(colIdx);
382 pars = col->getParagraphs();
383 for (parIdx = 0; parIdx < pars->getLength(); ++parIdx) {
384 par = (TextParagraph *)pars->get(parIdx);
385 lines = par->getLines();
386 for (lineIdx = 0; lineIdx < lines->getLength(); ++lineIdx) {
387 line = (TextLine *)lines->get(lineIdx);
388 if (line->getRotation() != 0) {
389 continue;
390 }
391 words = line->getWords();
392 base = line->getBaseline();
393 s = new GString();
394 word0 = NULL;
395 subSuper0 = 0; // make gcc happy
396 r0 = g0 = b0 = 0; // make gcc happy
397 for (wordIdx = 0; wordIdx < words->getLength(); ++wordIdx) {
398 word1 = (TextWord *)words->get(wordIdx);
399 if (!drawInvisibleText && word1->isInvisible()) {
400 continue;
401 }
402 word1->getColor(&r1, &g1, &b1);
403 base1 = word1->getBaseline();
404 if (base1 - base < -1) {
405 subSuper1 = -1; // superscript
406 } else if (base1 - base > 1) {
407 subSuper1 = 1; // subscript
408 } else {
409 subSuper1 = 0;
410 }
411 if (!word0 ||
412 word1->getFontInfo() != word0->getFontInfo() ||
413 word1->getFontSize() != word0->getFontSize() ||
414 subSuper1 != subSuper0 ||
415 r1 != r0 || g1 != g0 || b1 != b0) {
416 if (word0) {
417 s->append("</span>");
418 }
419 for (i = 0; i < fonts->getLength(); ++i) {
420 if (word1->getFontInfo() == (TextFontInfo *)fonts->get(i)) {
421 break;
422 }
423 }
424 s->appendf("<span id=\"f{0:d}\" style=\"font-size:{1:d}px;vertical-align:{2:s};color:#{3:02x}{4:02x}{5:02x};\">",
425 i, (int)(fontScales[i] * word1->getFontSize()),
426 subSuper1 < 0 ? "super"
427 : subSuper1 > 0 ? "sub"
428 : "baseline",
429 (int)(r1 * 255), (int)(g1 * 255), (int)(b1 * 255));
430 }
431 for (i = 0; i < word1->getLength(); ++i) {
432 u = word1->getChar(i);
433 if (u >= privateUnicodeMapStart &&
434 u <= privateUnicodeMapEnd &&
435 privateUnicodeMap[u - privateUnicodeMapStart]) {
436 u = privateUnicodeMap[u - privateUnicodeMapStart];
437 }
438 if (u <= 0x7f) {
439 if (u == '&') {
440 s->append("&");
441 } else if (u == '<') {
442 s->append("<");
443 } else if (u == '>') {
444 s->append(">");
445 } else {
446 s->append((char)u);
447 }
448 } else if (u <= 0x7ff) {
449 s->append((char)(0xc0 + (u >> 6)));
450 s->append((char)(0x80 + (u & 0x3f)));
451 } else if (u <= 0xffff) {
452 s->append((char)0xe0 + (u >> 12));
453 s->append((char)0x80 + ((u >> 6) & 0x3f));
454 s->append((char)0x80 + (u & 0x3f));
455 } else if (u <= 0x1fffff) {
456 s->append((char)0xf0 + (u >> 18));
457 s->append((char)0x80 + ((u >> 12) & 0x3f));
458 s->append((char)0x80 + ((u >> 6) & 0x3f));
459 s->append((char)0x80 + (u & 0x3f));
460 } else if (u <= 0x3ffffff) {
461 s->append((char)0xf8 + (u >> 24));
462 s->append((char)0x80 + ((u >> 18) & 0x3f));
463 s->append((char)0x80 + ((u >> 12) & 0x3f));
464 s->append((char)0x80 + ((u >> 6) & 0x3f));
465 s->append((char)0x80 + (u & 0x3f));
466 } else if (u <= 0x7fffffff) {
467 s->append((char)0xfc + (u >> 30));
468 s->append((char)0x80 + ((u >> 24) & 0x3f));
469 s->append((char)0x80 + ((u >> 18) & 0x3f));
470 s->append((char)0x80 + ((u >> 12) & 0x3f));
471 s->append((char)0x80 + ((u >> 6) & 0x3f));
472 s->append((char)0x80 + (u & 0x3f));
473 }
474 }
475 if (word1->getSpaceAfter()) {
476 s->append(' ');
477 }
478 word0 = word1;
479 subSuper0 = subSuper1;
480 r0 = r1;
481 g0 = g1;
482 b0 = b1;
483 }
484 s->append("</span>");
485 pf(writeHTML, htmlStream, "<div class=\"txt\" style=\"position:absolute; left:{0:d}px; top:{1:d}px;\">{2:t}</div>\n",
486 (int)line->getXMin(), (int)line->getYMin(), s);
487 delete s;
488 }
489 }
490 }
491 gfree(fontScales);
492 delete text;
493 deleteGList(cols, TextColumn);
494
495 // HTML trailer
496 pr(writeHTML, htmlStream, "</body>\n");
497 pr(writeHTML, htmlStream, "</html>\n");
498
499 return errNone;
500 }
501
getFontDefn(TextFontInfo * font,double * scale)502 GString *HTMLGen::getFontDefn(TextFontInfo *font, double *scale) {
503 GString *fontName;
504 char *fontName2;
505 FontStyleTagInfo *fst;
506 StandardFontInfo *sf;
507 GBool fixedWidth, serif, bold, italic;
508 double s;
509 int n, i;
510
511 // get the font name, remove any subset tag
512 fontName = font->getFontName();
513 if (fontName) {
514 fontName2 = fontName->getCString();
515 n = fontName->getLength();
516 for (i = 0; i < n && i < 7; ++i) {
517 if (fontName2[i] < 'A' || fontName2[i] > 'Z') {
518 break;
519 }
520 }
521 if (i == 6 && n > 7 && fontName2[6] == '+') {
522 fontName2 += 7;
523 n -= 7;
524 }
525 } else {
526 fontName2 = NULL;
527 n = 0;
528 }
529
530 // get the style info from the font descriptor flags
531 fixedWidth = font->isFixedWidth();
532 serif = font->isSerif();
533 bold = font->isBold();
534 italic = font->isItalic();
535
536 if (fontName2) {
537
538 // look for a style tag at the end of the font name -- this
539 // overrides the font descriptor bold/italic flags
540 for (fst = fontStyleTags; fst->tag; ++fst) {
541 if (n > fst->tagLen &&
542 !strcasecmp(fontName2 + n - fst->tagLen, fst->tag)) {
543 bold = fst->bold;
544 italic = fst->italic;
545 n -= fst->tagLen;
546 if (n > 1 && (fontName2[n-1] == '-' ||
547 fontName2[n-1] == ',' ||
548 fontName2[n-1] == '.' ||
549 fontName2[n-1] == '_')) {
550 --n;
551 }
552 break;
553 }
554 }
555
556 // look for a known font name -- this overrides the font descriptor
557 // fixedWidth/serif flags
558 for (sf = standardFonts; sf->name; ++sf) {
559 if (!strncasecmp(fontName2, sf->name, n)) {
560 fixedWidth = sf->fixedWidth;
561 serif = sf->serif;
562 break;
563 }
564 }
565 }
566
567 // compute the scaling factor
568 *scale = 1;
569 if ((s = font->getMWidth())) {
570 i = (fixedWidth ? 8 : serif ? 4 : 0) + (bold ? 2 : 0) + (italic ? 1 : 0);
571 if (s < substFonts[i].mWidth) {
572 *scale = s / substFonts[i].mWidth;
573 }
574 }
575
576 // generate the CSS markup
577 return GString::format("font-family:{0:s}; font-weight:{1:s}; font-style:{2:s};",
578 fixedWidth ? "monospace"
579 : serif ? "serif"
580 : "sans-serif",
581 bold ? "bold" : "normal",
582 italic ? "italic" : "normal");
583 }
584