1 #include "mupdf/fitz.h"
2 #include "mupdf/ucdn.h"
3 #include "html-imp.h"
4
5 #include <string.h>
6 #include <stdio.h>
7 #include <assert.h>
8
/* Side indices -- presumably Top, Right, Bottom, Left for per-side box arrays; TODO confirm against their users. */
enum { T, R, B, L };

/* Direction used when an element's "dir" attribute holds an unrecognised value. */
#define DEFAULT_DIR FZ_BIDI_LTR
12
/*
 * Built-in style rules for HTML elements, stored as a single concatenated
 * CSS string (one rule per source line).
 * NOTE(review): not referenced in the visible part of this file; presumably
 * parsed ahead of any document-supplied stylesheet -- confirm at the use site.
 */
static const char *html_default_css =
"@page{margin:3em 2em}"
"a{color:#06C;text-decoration:underline}"
"address{display:block;font-style:italic}"
"b{font-weight:bold}"
"bdo{direction:rtl;unicode-bidi:bidi-override}"
"blockquote{display:block;margin:1em 40px}"
"body{display:block;margin:1em}"
"cite{font-style:italic}"
"code{font-family:monospace}"
"dd{display:block;margin:0 0 0 40px}"
"del{text-decoration:line-through}"
"div{display:block}"
"dl{display:block;margin:1em 0}"
"dt{display:block}"
"em{font-style:italic}"
"h1{display:block;font-size:2em;font-weight:bold;margin:0.67em 0;page-break-after:avoid}"
"h2{display:block;font-size:1.5em;font-weight:bold;margin:0.83em 0;page-break-after:avoid}"
"h3{display:block;font-size:1.17em;font-weight:bold;margin:1em 0;page-break-after:avoid}"
"h4{display:block;font-size:1em;font-weight:bold;margin:1.33em 0;page-break-after:avoid}"
"h5{display:block;font-size:0.83em;font-weight:bold;margin:1.67em 0;page-break-after:avoid}"
"h6{display:block;font-size:0.67em;font-weight:bold;margin:2.33em 0;page-break-after:avoid}"
"head{display:none}"
"hr{border-style:solid;border-width:1px;display:block;margin-bottom:0.5em;margin-top:0.5em;text-align:center}"
"html{display:block}"
"i{font-style:italic}"
"ins{text-decoration:underline}"
"kbd{font-family:monospace}"
"li{display:list-item}"
"menu{display:block;list-style-type:disc;margin:1em 0;padding:0 0 0 30pt}"
"ol{display:block;list-style-type:decimal;margin:1em 0;padding:0 0 0 30pt}"
"p{display:block;margin:1em 0}"
"pre{display:block;font-family:monospace;margin:1em 0;white-space:pre}"
"samp{font-family:monospace}"
"script{display:none}"
"small{font-size:0.83em}"
"strong{font-weight:bold}"
"style{display:none}"
"sub{font-size:0.83em;vertical-align:sub}"
"sup{font-size:0.83em;vertical-align:super}"
"table{display:table}"
"tbody{display:table-row-group}"
"td{display:table-cell;padding:1px}"
"tfoot{display:table-footer-group}"
"th{display:table-cell;font-weight:bold;padding:1px;text-align:center}"
"thead{display:table-header-group}"
"tr{display:table-row}"
"ul{display:block;list-style-type:disc;margin:1em 0;padding:0 0 0 30pt}"
"ul ul{list-style-type:circle}"
"ul ul ul{list-style-type:square}"
"var{font-style:italic}"
"svg{display:none}"
;
66
/*
 * Built-in style rules for FictionBook (FB2) elements, stored as a single
 * concatenated CSS string (one rule per source line).
 * NOTE(review): not referenced in the visible part of this file; presumably
 * the FB2 counterpart of html_default_css -- confirm at the use site.
 */
static const char *fb2_default_css =
"@page{margin:3em 2em}"
"FictionBook{display:block;margin:1em}"
"stylesheet,binary{display:none}"
"description>*{display:none}"
"description>title-info{display:block}"
"description>title-info>*{display:none}"
"description>title-info>coverpage{display:block;page-break-before:always;page-break-after:always}"
"body,section,title,subtitle,p,cite,epigraph,text-author,date,poem,stanza,v,empty-line{display:block}"
"image{display:block}"
"p>image{display:inline}"
"table{display:table}"
"tr{display:table-row}"
"th,td{display:table-cell}"
"a{color:#06C;text-decoration:underline}"
"a[type=note]{font-size:small;vertical-align:super}"
"code{white-space:pre;font-family:monospace}"
"emphasis{font-style:italic}"
"strikethrough{text-decoration:line-through}"
"strong{font-weight:bold}"
"sub{font-size:small;vertical-align:sub}"
"sup{font-size:small;vertical-align:super}"
"image{margin:1em 0;text-align:center}"
"cite,poem{margin:1em 2em}"
"subtitle,epigraph,stanza{margin:1em 0}"
"title>p{text-align:center;font-size:x-large}"
"subtitle{text-align:center;font-size:large}"
"p{margin-top:1em;text-align:justify}"
"empty-line{padding-top:1em}"
"p+p{margin-top:0;text-indent:1.5em}"
"empty-line+p{margin-top:0}"
"section>title{page-break-before:always}"
;
100
/* State shared by the box/flow generation functions while walking one document. */
struct genstate
{
	fz_pool *pool; /* pool from which boxes, flow nodes and copied strings are allocated */
	fz_html_font_set *set; /* font set handed to fz_apply_css_style */
	fz_archive *zip; /* archive from which image files are read */
	fz_tree *images; /* images looked up by id for FB2 <image> elements */
	int is_fb2; /* non-zero when the markup is FictionBook rather than HTML */
	const char *base_uri; /* prefix prepended to relative image paths */
	fz_css *css; /* stylesheet matched against every element */
	int at_bol; /* set after a line break or a new flow box; cleared after a word or image */
	int emit_white; /* collapsed whitespace is pending and is emitted by flush_space */
	int last_brk_cls; /* line break class of the previous character (row index into pairbrk) */
	fz_css_style_splay *styles; /* style store passed to fz_css_enlist */
};
115
/* Return 1 when c is a space, tab, carriage return or newline, and 0 otherwise. */
static int iswhite(int c)
{
	switch (c)
	{
	case ' ':
	case '\t':
	case '\r':
	case '\n':
		return 1;
	default:
		return 0;
	}
}
120
/* Return 1 when every character of s satisfies iswhite (also for the empty string), else 0. */
static int is_all_white(const char *s)
{
	for (; *s; ++s)
		if (!iswhite(*s))
			return 0;
	return 1;
}
131
132 /* TODO: pool allocator for flow nodes */
133 /* TODO: store text by pointing to a giant buffer */
134
/*
 * Release the image reference held by every FLOW_IMAGE node in the list
 * starting at 'flow'. The nodes themselves are not freed here.
 */
static void fz_drop_html_flow(fz_context *ctx, fz_html_flow *flow)
{
	fz_html_flow *node;

	for (node = flow; node != NULL; node = node->next)
	{
		if (node->type == FLOW_IMAGE)
			fz_drop_image(ctx, node->content.image);
	}
}
145
add_flow(fz_context * ctx,fz_pool * pool,fz_html_box * top,fz_html_box * inline_box,int type,int extras)146 static fz_html_flow *add_flow(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box, int type, int extras)
147 {
148 size_t size = (type == FLOW_IMAGE ? sizeof(fz_html_flow) : offsetof(fz_html_flow, content) + extras);
149 fz_html_flow *flow = fz_pool_alloc(ctx, pool, size);
150 flow->type = type;
151 flow->expand = 0;
152 flow->bidi_level = 0;
153 flow->markup_lang = 0;
154 flow->breaks_line = 0;
155 flow->box = inline_box;
156 *top->flow_tail = flow;
157 top->flow_tail = &flow->next;
158 return flow;
159 }
160
/* Append a FLOW_SPACE node to 'top' and mark it as expandable. */
static void add_flow_space(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box)
{
	add_flow(ctx, pool, top, inline_box, FLOW_SPACE, 0)->expand = 1;
}
166
/* Append a FLOW_BREAK node (no payload) to the flow list of 'top'. */
static void add_flow_break(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box)
{
	add_flow(ctx, pool, top, inline_box, FLOW_BREAK, 0);
}
171
/* Append a FLOW_SBREAK node (no payload) to the flow list of 'top'. */
static void add_flow_sbreak(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box)
{
	add_flow(ctx, pool, top, inline_box, FLOW_SBREAK, 0);
}
176
/* Append a FLOW_SHYPHEN node (no payload) to the flow list of 'top'. */
static void add_flow_shyphen(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box)
{
	add_flow(ctx, pool, top, inline_box, FLOW_SHYPHEN, 0);
}
181
/*
 * Append a FLOW_WORD node holding a NUL-terminated copy of the bytes in
 * [a, b) and tag it with the markup language 'lang'.
 */
static void add_flow_word(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box, const char *a, const char *b, int lang)
{
	ptrdiff_t len = b - a;
	fz_html_flow *word = add_flow(ctx, pool, top, inline_box, FLOW_WORD, len + 1);
	char *dst = word->content.text;

	memcpy(dst, a, len);
	dst[len] = 0;
	word->markup_lang = lang;
}
189
/* Append a FLOW_IMAGE node that holds its own (kept) reference to 'img'. */
static void add_flow_image(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box, fz_image *img)
{
	fz_html_flow *node = add_flow(ctx, pool, top, inline_box, FLOW_IMAGE, 0);
	fz_image *kept = fz_keep_image(ctx, img);

	node->content.image = kept;
}
195
/* Append a FLOW_ANCHOR node (no payload) to the flow list of 'top'. */
static void add_flow_anchor(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box)
{
	add_flow(ctx, pool, top, inline_box, FLOW_ANCHOR, 0);
}
200
/*
 * Split a FLOW_WORD node after 'offset' characters (as decoded by
 * fz_chartorune).
 *
 * The original node keeps the leading part; a new node carrying a copy of the
 * node header and the remaining text is linked directly after it and
 * returned. With offset == 0 nothing is split and 'flow' itself is returned.
 * If 'offset' reaches past the end of the text the new node holds an empty
 * string.
 */
static fz_html_flow *split_flow(fz_context *ctx, fz_pool *pool, fz_html_flow *flow, size_t offset)
{
	const size_t header = offsetof(fz_html_flow, content);
	fz_html_flow *tail;
	char *cut;
	size_t tail_len;
	int rune;

	assert(flow->type == FLOW_WORD);

	if (offset == 0)
		return flow;

	/* Step over 'offset' characters, stopping early at the terminator. */
	for (cut = flow->content.text; *cut && offset; --offset)
		cut += fz_chartorune(&rune, cut);

	tail_len = strlen(cut);
	tail = fz_pool_alloc(ctx, pool, header + tail_len + 1);

	/* Clone everything before 'content', then splice the clone in after 'flow'. */
	memcpy(tail, flow, header);
	tail->next = flow->next;
	flow->next = tail;

	/* Move the remainder (including its terminator) and truncate the original. */
	memcpy(tail->content.text, cut, tail_len + 1);
	*cut = 0;

	return tail;
}
227
/*
 * Emit the pending collapsed whitespace, if any, and clear the pending flag.
 *
 * Nothing is emitted at the beginning of a line (g->at_bol). Otherwise a
 * FLOW_SPACE node is added when the inline box's style allows breaking at
 * spaces, or a literal " " word when it does not.
 */
static void flush_space(fz_context *ctx, fz_html_box *flow, fz_html_box *inline_box, int lang, struct genstate *g)
{
	static const char *space = " ";
	int breakable = inline_box->style->white_space & WS_ALLOW_BREAK_SPACE;

	if (!g->emit_white)
		return;

	if (!g->at_bol)
	{
		if (breakable)
			add_flow_space(ctx, g->pool, flow, inline_box);
		else
			add_flow_word(ctx, g->pool, flow, inline_box, space, space+1, lang);
	}

	g->emit_white = 0;
}
245
/*
 * Pair-wise lookup table for UAX#14 line breaks.
 *
 * generate_text indexes it as pairbrk[previous class][current class]. An
 * entry of '_' makes generate_text insert a soft break between the two
 * characters; '^' means no break, and generate_text also treats '%', '@' and
 * '#' as '^' because spaces are handled elsewhere.
 * The two header comment lines below spell the two-letter class name of each
 * column, read vertically.
 */
static const char *pairbrk[29] =
{
	/* -OCCQGNESIPPNAHIIHBBBZCWHHJJJR- */
	/* -PLPULSXYSROULLDNYAB2WMJ23LVTI- */
	"^^^^^^^^^^^^^^^^^^^^^^^^^^^^^", /* OP open punctuation */
	"_^^%%^^^^%%_____%%__^^^______", /* CL close punctuation */
	"_^^%%^^^^%%%%%__%%__^^^______", /* CP close parenthesis */
	"^^^%%%^^^%%%%%%%%%%%^^^%%%%%%", /* QU quotation */
	"%^^%%%^^^%%%%%%%%%%%^^^%%%%%%", /* GL non-breaking glue */
	"_^^%%%^^^_______%%__^^^______", /* NS nonstarters */
	"_^^%%%^^^______%%%__^^^______", /* EX exclamation/interrogation */
	"_^^%%%^^^__%_%__%%__^^^______", /* SY symbols allowing break after */
	"_^^%%%^^^__%%%__%%__^^^______", /* IS infix numeric separator */
	"%^^%%%^^^__%%%%_%%__^^^%%%%%_", /* PR prefix numeric */
	"%^^%%%^^^__%%%__%%__^^^______", /* PO postfix numeric */
	"%^^%%%^^^%%%%%_%%%__^^^______", /* NU numeric */
	"%^^%%%^^^__%%%_%%%__^^^______", /* AL ordinary alphabetic and symbol characters */
	"%^^%%%^^^__%%%_%%%__^^^______", /* HL hebrew letter */
	"_^^%%%^^^_%____%%%__^^^______", /* ID ideographic */
	"_^^%%%^^^______%%%__^^^______", /* IN inseparable characters */
	"_^^%_%^^^__%____%%__^^^______", /* HY hyphens */
	"_^^%_%^^^_______%%__^^^______", /* BA break after */
	"%^^%%%^^^%%%%%%%%%%%^^^%%%%%%", /* BB break before */
	"_^^%%%^^^_______%%_^^^^______", /* B2 break opportunity before and after */
	"____________________^________", /* ZW zero width space */
	"%^^%%%^^^__%%%_%%%__^^^______", /* CM combining mark */
	"%^^%%%^^^%%%%%%%%%%%^^^%%%%%%", /* WJ word joiner */
	"_^^%%%^^^_%____%%%__^^^___%%_", /* H2 hangul leading/vowel syllable */
	"_^^%%%^^^_%____%%%__^^^____%_", /* H3 hangul leading/vowel/trailing syllable */
	"_^^%%%^^^_%____%%%__^^^%%%%__", /* JL hangul leading jamo */
	"_^^%%%^^^_%____%%%__^^^___%%_", /* JV hangul vowel jamo */
	"_^^%%%^^^_%____%%%__^^^____%_", /* JT hangul trailing jamo */
	"_^^%%%^^^_______%%__^^^_____%", /* RI regional indicator */
};
281
/*
 * Convert a run of character data into flow nodes on the BOX_FLOW ancestor
 * of 'box'.
 *
 * The white-space style of 'box' selects the behaviour:
 *   WS_COLLAPSE             runs of whitespace only set g->emit_white; the
 *                           space is emitted later by flush_space.
 *   WS_ALLOW_BREAK_SPACE    spaces become FLOW_SPACE nodes and soft breaks
 *                           are inserted inside words according to pairbrk.
 *   WS_FORCE_BREAK_NEWLINE  CR, LF and CRLF each become one FLOW_BREAK.
 *
 * 'lang' is stored on every word node. g->at_bol, g->emit_white and
 * g->last_brk_cls are read and updated so that state carries over between
 * calls.
 */
static void generate_text(fz_context *ctx, fz_html_box *box, const char *text, int lang, struct genstate *g)
{
	fz_html_box *flow;
	fz_pool *pool = g->pool;
	int collapse = box->style->white_space & WS_COLLAPSE;
	int bsp = box->style->white_space & WS_ALLOW_BREAK_SPACE;
	int bnl = box->style->white_space & WS_FORCE_BREAK_NEWLINE;

	static const char *space = " ";

	/* Find the flow box that collects the nodes for this inline box. */
	flow = box;
	while (flow->type != BOX_FLOW)
		flow = flow->up;

	while (*text)
	{
		/* Forced line break: CRLF counts as a single newline. */
		if (bnl && (*text == '\n' || *text == '\r'))
		{
			if (text[0] == '\r' && text[1] == '\n')
				text += 2;
			else
				text += 1;
			add_flow_break(ctx, pool, flow, box);
			g->at_bol = 1;
		}
		else if (iswhite(*text))
		{
			if (collapse)
			{
				/* With forced newlines, stop at CR/LF so the branch above sees them. */
				if (bnl)
					while (*text == ' ' || *text == '\t')
						++text;
				else
					while (iswhite(*text))
						++text;
				/* Defer the space; flush_space emits it before the next word or image. */
				g->emit_white = 1;
			}
			else
			{
				// TODO: tabs
				/* Not collapsing: every whitespace character yields one space. */
				if (bsp)
					add_flow_space(ctx, pool, flow, box);
				else
					add_flow_word(ctx, pool, flow, box, space, space+1, lang);
				++text;
			}
			g->last_brk_cls = UCDN_LINEBREAK_CLASS_WJ; /* don't add sbreaks after a space */
		}
		else
		{
			/* 'mark' is the start of the pending word; 'prev' the start of the current character. */
			const char *prev, *mark = text;
			int c;

			flush_space(ctx, flow, box, lang, g);

			/* At the start of a line, treat the previous character as a word joiner. */
			if (g->at_bol)
				g->last_brk_cls = UCDN_LINEBREAK_CLASS_WJ;

			while (*text && !iswhite(*text))
			{
				prev = text;
				text += fz_chartorune(&c, text);
				if (c == 0xAD) /* soft hyphen */
				{
					/* Emit the text before the hyphen; the hyphen itself is dropped from the word. */
					if (mark != prev)
						add_flow_word(ctx, pool, flow, box, mark, prev, lang);
					add_flow_shyphen(ctx, pool, flow, box);
					mark = text;
					g->last_brk_cls = UCDN_LINEBREAK_CLASS_WJ; /* don't add sbreaks after a soft hyphen */
				}
				else if (bsp) /* allow soft breaks */
				{
					int this_brk_cls = ucdn_get_resolved_linebreak_class(c);
					/* Classes at or beyond RI are skipped and leave last_brk_cls unchanged. */
					if (this_brk_cls < UCDN_LINEBREAK_CLASS_RI)
					{
						int brk = pairbrk[g->last_brk_cls][this_brk_cls];

						/* we handle spaces elsewhere, so ignore these classes */
						if (brk == '@') brk = '^';
						if (brk == '#') brk = '^';
						if (brk == '%') brk = '^';

						if (brk == '_')
						{
							/* Break before the current character: it starts the next word. */
							if (mark != prev)
								add_flow_word(ctx, pool, flow, box, mark, prev, lang);
							add_flow_sbreak(ctx, pool, flow, box);
							mark = prev;
						}

						g->last_brk_cls = this_brk_cls;
					}
				}
			}
			/* Emit whatever remains of the word. */
			if (mark != text)
				add_flow_word(ctx, pool, flow, box, mark, text, lang);

			g->at_bol = 0;
		}
	}
}
383
load_html_image(fz_context * ctx,fz_archive * zip,const char * base_uri,const char * src)384 static fz_image *load_html_image(fz_context *ctx, fz_archive *zip, const char *base_uri, const char *src)
385 {
386 char path[2048];
387 fz_image *img = NULL;
388 fz_buffer *buf = NULL;
389
390 fz_var(img);
391 fz_var(buf);
392
393 fz_try(ctx)
394 {
395 if (!strncmp(src, "data:image/jpeg;base64,", 23))
396 buf = fz_new_buffer_from_base64(ctx, src+23, 0);
397 else if (!strncmp(src, "data:image/png;base64,", 22))
398 buf = fz_new_buffer_from_base64(ctx, src+22, 0);
399 else if (!strncmp(src, "data:image/gif;base64,", 22))
400 buf = fz_new_buffer_from_base64(ctx, src+22, 0);
401 else
402 {
403 fz_strlcpy(path, base_uri, sizeof path);
404 fz_strlcat(path, "/", sizeof path);
405 fz_strlcat(path, src, sizeof path);
406 fz_urldecode(path);
407 fz_cleanname(path);
408 buf = fz_read_archive_entry(ctx, zip, path);
409 }
410 #if FZ_ENABLE_SVG
411 if (strstr(src, ".svg"))
412 img = fz_new_image_from_svg(ctx, buf, base_uri, zip);
413 else
414 #endif
415 img = fz_new_image_from_buffer(ctx, buf);
416 }
417 fz_always(ctx)
418 fz_drop_buffer(ctx, buf);
419 fz_catch(ctx)
420 fz_warn(ctx, "html: cannot load image src='%s'", src);
421
422 return img;
423 }
424
load_svg_image(fz_context * ctx,fz_archive * zip,const char * base_uri,fz_xml * xml)425 static fz_image *load_svg_image(fz_context *ctx, fz_archive *zip, const char *base_uri, fz_xml *xml)
426 {
427 fz_image *img = NULL;
428 fz_try(ctx)
429 img = fz_new_image_from_svg_xml(ctx, xml, base_uri, zip);
430 fz_catch(ctx)
431 fz_warn(ctx, "html: cannot load embedded svg document");
432 return img;
433 }
434
/* Add a FLOW_ANCHOR node for 'box' to its nearest BOX_FLOW ancestor. */
static void generate_anchor(fz_context *ctx, fz_html_box *box, struct genstate *g)
{
	fz_html_box *flow;

	for (flow = box; flow->type != BOX_FLOW; flow = flow->up)
		;

	add_flow_anchor(ctx, g->pool, flow, box);
}
443
/*
 * Add an image to the flow that contains 'box'.
 *
 * Pending whitespace is flushed first. A NULL 'img' is replaced by the
 * placeholder word "[image]". Otherwise the image node is wrapped in soft
 * breaks, and the caller's reference to 'img' is always dropped (the flow
 * node keeps its own). Clears g->at_bol.
 */
static void generate_image(fz_context *ctx, fz_html_box *box, fz_image *img, struct genstate *g)
{
	fz_pool *pool = g->pool;
	fz_html_box *flow;

	for (flow = box; flow->type != BOX_FLOW; flow = flow->up)
		;

	flush_space(ctx, flow, box, 0, g);

	if (img)
	{
		fz_try(ctx)
		{
			add_flow_sbreak(ctx, pool, flow, box);
			add_flow_image(ctx, pool, flow, box, img);
			add_flow_sbreak(ctx, pool, flow, box);
		}
		fz_always(ctx)
		{
			fz_drop_image(ctx, img);
		}
		fz_catch(ctx)
		{
			fz_rethrow(ctx);
		}
	}
	else
	{
		const char *alt = "[image]";
		add_flow_word(ctx, pool, flow, box, alt, alt + strlen(alt), 0);
	}

	g->at_bol = 0;
}
476
/*
 * Reset the common fields of a freshly allocated box: block type, zero
 * geometry, no tree links, an empty flow list, no style, and the given
 * markup direction.
 */
static void init_box(fz_context *ctx, fz_html_box *box, fz_bidi_direction markup_dir)
{
	box->type = BOX_BLOCK;
	box->markup_dir = markup_dir;
	box->style = NULL;

	/* Geometry. */
	box->x = 0;
	box->y = 0;
	box->w = 0;
	box->b = 0;

	/* Tree links. */
	box->up = NULL;
	box->down = NULL;
	box->next = NULL;

	/* Empty flow list: the tail pointer refers to the head slot. */
	box->flow_head = NULL;
	box->flow_tail = &box->flow_head;
}
492
/*
 * Release the images referenced from 'box', its following siblings and all
 * of their descendants. The boxes themselves are not freed here.
 */
static void fz_drop_html_box(fz_context *ctx, fz_html_box *box)
{
	fz_html_box *cur;

	for (cur = box; cur != NULL; cur = cur->next)
	{
		fz_drop_html_flow(ctx, cur->flow_head);
		fz_drop_html_box(ctx, cur->down);
	}
}
503
/*
 * Destroy an fz_html passed as its storable header: drop the images held by
 * the box tree, then drop the pool stored in the document.
 */
static void fz_drop_html_imp(fz_context *ctx, fz_storable *stor)
{
	fz_html *doc = (fz_html *)stor;
	fz_html_box *root = doc->root;

	fz_drop_html_box(ctx, root);
	fz_drop_pool(ctx, doc->pool);
}
510
/* Drop one reference to 'html'; the drop is bracketed by fz_defer_reap_start/end. */
void fz_drop_html(fz_context *ctx, fz_html *html)
{
	fz_defer_reap_start(ctx);
	{
		fz_drop_storable(ctx, &html->storable);
	}
	fz_defer_reap_end(ctx);
}
517
/* Take an additional reference to 'html' and return the result of fz_keep_storable. */
fz_html *fz_keep_html(fz_context *ctx, fz_html *html)
{
	fz_html *kept = fz_keep_storable(ctx, &html->storable);

	return kept;
}
522
new_box(fz_context * ctx,fz_pool * pool,fz_bidi_direction markup_dir)523 static fz_html_box *new_box(fz_context *ctx, fz_pool *pool, fz_bidi_direction markup_dir)
524 {
525 fz_html_box *box = fz_pool_alloc(ctx, pool, sizeof *box);
526 init_box(ctx, box, markup_dir);
527 return box;
528 }
529
new_short_box(fz_context * ctx,fz_pool * pool,fz_bidi_direction markup_dir)530 static fz_html_box *new_short_box(fz_context *ctx, fz_pool *pool, fz_bidi_direction markup_dir)
531 {
532 fz_html_box *box = fz_pool_alloc(ctx, pool, offsetof(fz_html_box, padding));
533 init_box(ctx, box, markup_dir);
534 return box;
535 }
536
/*
 * Give 'box' the given type and append it as the last child of 'top'
 * (or just set its parent to NULL when 'top' is NULL).
 *
 * While the tree is being built, top->next does not point to top's sibling:
 * it points to top's most recently added child. This is fixed up in a pass
 * at the end of parsing.
 */
static void insert_box(fz_context *ctx, fz_html_box *box, int type, fz_html_box *top)
{
	box->type = type;
	box->up = top;

	if (!top)
		return;

	if (top->next)
	{
		/* Chain after the current last child; on the child, 'next' really means next. */
		top->next->next = box;
		top->next = box;
	}
	else
	{
		/* First child. */
		top->down = box;
		top->next = box;
	}
}
559
/*
 * Insert 'box' as a block-level child.
 *
 * When 'top' is a flow or inline box, the nearest BOX_BLOCK ancestor becomes
 * the parent instead. Returns the box that was used as the parent.
 *
 * NOTE(review): for any other type of 'top' (for example a table box)
 * nothing is inserted and 'top' is returned unchanged -- confirm that this
 * is intended.
 */
static fz_html_box *insert_block_box(fz_context *ctx, fz_html_box *box, fz_html_box *top)
{
	if (top->type == BOX_FLOW || top->type == BOX_INLINE)
	{
		/* Climb out of inline content to the enclosing block. */
		while (top->type != BOX_BLOCK)
			top = top->up;
		insert_box(ctx, box, BOX_BLOCK, top);
	}
	else if (top->type == BOX_BLOCK)
	{
		insert_box(ctx, box, BOX_BLOCK, top);
	}

	return top;
}
580
/* Insert 'box' like a block box, then mark it as a table. Returns the parent used. */
static fz_html_box *insert_table_box(fz_context *ctx, fz_html_box *box, fz_html_box *top)
{
	fz_html_box *parent = insert_block_box(ctx, box, top);

	box->type = BOX_TABLE;

	return parent;
}
587
/*
 * Insert 'box' as a row of the nearest enclosing table (starting at 'top')
 * and return that table. Without an enclosing table, warn, insert the box as
 * a block and return 'top'.
 */
static fz_html_box *insert_table_row_box(fz_context *ctx, fz_html_box *box, fz_html_box *top)
{
	fz_html_box *table;

	for (table = top; table; table = table->up)
	{
		if (table->type == BOX_TABLE)
		{
			insert_box(ctx, box, BOX_TABLE_ROW, table);
			return table;
		}
	}

	fz_warn(ctx, "table-row not inside table element");
	insert_block_box(ctx, box, top);
	return top;
}
602
/*
 * Insert 'box' as a cell of the nearest enclosing table row (starting at
 * 'top') and return that row. Without an enclosing row, warn, insert the box
 * as a block and return 'top'.
 */
static fz_html_box *insert_table_cell_box(fz_context *ctx, fz_html_box *box, fz_html_box *top)
{
	fz_html_box *row;

	for (row = top; row; row = row->up)
	{
		if (row->type == BOX_TABLE_ROW)
		{
			insert_box(ctx, box, BOX_TABLE_CELL, row);
			return row;
		}
	}

	fz_warn(ctx, "table-cell not inside table-row element");
	insert_block_box(ctx, box, top);
	return top;
}
617
/*
 * Insert 'box' as an inline box.
 *
 * Inside a flow or inline box it becomes a direct child. Otherwise the
 * nearest block or table-cell ancestor is located; if its last child is
 * already a flow box the inline box joins it, else a new flow box with the
 * default style is created first and g->at_bol is set.
 */
static void insert_inline_box(fz_context *ctx, fz_html_box *box, fz_html_box *top, int markup_dir, struct genstate *g)
{
	fz_html_box *container;
	fz_html_box *last_child;

	if (top->type == BOX_FLOW || top->type == BOX_INLINE)
	{
		insert_box(ctx, box, BOX_INLINE, top);
		return;
	}

	container = top;
	while (container->type != BOX_BLOCK && container->type != BOX_TABLE_CELL)
		container = container->up;

	/* During construction 'next' holds the container's last child. */
	last_child = container->next;

	if (last_child && last_child->type == BOX_FLOW)
	{
		insert_box(ctx, box, BOX_INLINE, last_child);
	}
	else
	{
		fz_css_style style;
		fz_html_box *flow = new_short_box(ctx, g->pool, markup_dir);

		flow->is_first_flow = !last_child;
		fz_default_css_style(ctx, &style);
		flow->style = fz_css_enlist(ctx, &style, &g->styles, g->pool);
		insert_box(ctx, flow, BOX_FLOW, container);
		insert_box(ctx, box, BOX_INLINE, flow);
		g->at_bol = 1;
	}
}
647
648 static fz_html_box *
generate_boxes(fz_context * ctx,fz_xml * node,fz_html_box * top,fz_css_match * up_match,int list_counter,int section_depth,int markup_dir,int markup_lang,struct genstate * g)649 generate_boxes(fz_context *ctx,
650 fz_xml *node,
651 fz_html_box *top,
652 fz_css_match *up_match,
653 int list_counter,
654 int section_depth,
655 int markup_dir,
656 int markup_lang,
657 struct genstate *g)
658 {
659 fz_html_box *box, *last_top;
660 const char *tag;
661 int display;
662 fz_css_style style;
663
664 while (node)
665 {
666
667 tag = fz_xml_tag(node);
668 if (tag)
669 {
670 fz_css_match match;
671
672 fz_match_css(ctx, &match, up_match, g->css, node);
673
674 display = fz_get_css_match_display(&match);
675
676 fz_apply_css_style(ctx, g->set, &style, &match);
677
678 if (tag[0]=='b' && tag[1]=='r' && tag[2]==0)
679 {
680 fz_html_box *flow;
681 if (top->type != BOX_INLINE)
682 {
683 /* Create anonymous inline box, with the same style as the top block box. */
684 fz_css_style style;
685 box = new_short_box(ctx, g->pool, markup_dir);
686 fz_default_css_style(ctx, &style);
687 box->style = fz_css_enlist(ctx, &style, &g->styles, g->pool);
688 insert_inline_box(ctx, box, top, markup_dir, g);
689 style = *top->style;
690 /* Make sure not to recursively multiply font sizes. */
691 style.font_size.value = 1;
692 style.font_size.unit = N_SCALE;
693 box->style = fz_css_enlist(ctx, &style, &g->styles, g->pool);
694 flow = box;
695 while (flow->type != BOX_FLOW)
696 flow = flow->up;
697 add_flow_break(ctx, g->pool, flow, box);
698 }
699 else
700 {
701 flow = top;
702 while (flow->type != BOX_FLOW)
703 flow = flow->up;
704 add_flow_break(ctx, g->pool, flow, top);
705 }
706 g->at_bol = 1;
707 }
708
709 else if (tag[0]=='i' && tag[1]=='m' && tag[2]=='g' && tag[3]==0)
710 {
711 const char *src = fz_xml_att(node, "src");
712 if (src)
713 {
714 int w, h;
715 const char *w_att = fz_xml_att(node, "width");
716 const char *h_att = fz_xml_att(node, "height");
717 if (w_att && (w = fz_atoi(w_att)) > 0)
718 {
719 style.width.value = w;
720 style.width.unit = strchr(w_att, '%') ? N_PERCENT : N_LENGTH;
721 }
722 if (h_att && (h = fz_atoi(h_att)) > 0)
723 {
724 style.height.value = h;
725 style.height.unit = strchr(h_att, '%') ? N_PERCENT : N_LENGTH;
726 }
727
728 if (display == DIS_BLOCK)
729 {
730 fz_html_box *imgbox;
731 box = new_box(ctx, g->pool, markup_dir);
732 box->style = fz_css_enlist(ctx, &style, &g->styles, g->pool);
733 top = insert_block_box(ctx, box, top);
734 imgbox = new_short_box(ctx, g->pool, markup_dir);
735 imgbox->style = fz_css_enlist(ctx, &style, &g->styles, g->pool);
736 insert_inline_box(ctx, imgbox, box, markup_dir, g);
737 generate_image(ctx, imgbox, load_html_image(ctx, g->zip, g->base_uri, src), g);
738 }
739 else if (display == DIS_INLINE)
740 {
741 box = new_short_box(ctx, g->pool, markup_dir);
742 box->style = fz_css_enlist(ctx, &style, &g->styles, g->pool);
743 insert_inline_box(ctx, box, top, markup_dir, g);
744 generate_image(ctx, box, load_html_image(ctx, g->zip, g->base_uri, src), g);
745 }
746 }
747 }
748
749 else if (tag[0]=='s' && tag[1]=='v' && tag[2]=='g' && tag[3]==0)
750 {
751 box = new_short_box(ctx, g->pool, markup_dir);
752 box->style = fz_css_enlist(ctx, &style, &g->styles, g->pool);
753 insert_inline_box(ctx, box, top, markup_dir, g);
754 generate_image(ctx, box, load_svg_image(ctx, g->zip, g->base_uri, node), g);
755 }
756
757 else if (g->is_fb2 && tag[0]=='i' && tag[1]=='m' && tag[2]=='a' && tag[3]=='g' && tag[4]=='e' && tag[5]==0)
758 {
759 const char *src = fz_xml_att(node, "l:href");
760 if (!src)
761 src = fz_xml_att(node, "xlink:href");
762 if (src && src[0] == '#')
763 {
764 fz_image *img = fz_tree_lookup(ctx, g->images, src+1);
765 if (display == DIS_BLOCK)
766 {
767 fz_html_box *imgbox;
768 box = new_box(ctx, g->pool, markup_dir);
769 box->style = fz_css_enlist(ctx, &style, &g->styles, g->pool);
770 top = insert_block_box(ctx, box, top);
771 imgbox = new_short_box(ctx, g->pool, markup_dir);
772 imgbox->style = fz_css_enlist(ctx, &style, &g->styles, g->pool);
773 insert_inline_box(ctx, imgbox, box, markup_dir, g);
774 generate_image(ctx, imgbox, fz_keep_image(ctx, img), g);
775 }
776 else if (display == DIS_INLINE)
777 {
778 box = new_short_box(ctx, g->pool, markup_dir);
779 box->style = fz_css_enlist(ctx, &style, &g->styles, g->pool);
780 insert_inline_box(ctx, box, top, markup_dir, g);
781 generate_image(ctx, box, fz_keep_image(ctx, img), g);
782 }
783 }
784 }
785
786 else if (display != DIS_NONE)
787 {
788 const char *dir, *lang, *id, *href;
789 int child_dir = markup_dir;
790 int child_lang = markup_lang;
791
792 dir = fz_xml_att(node, "dir");
793 if (dir)
794 {
795 if (!strcmp(dir, "auto"))
796 child_dir = FZ_BIDI_NEUTRAL;
797 else if (!strcmp(dir, "rtl"))
798 child_dir = FZ_BIDI_RTL;
799 else if (!strcmp(dir, "ltr"))
800 child_dir = FZ_BIDI_LTR;
801 else
802 child_dir = DEFAULT_DIR;
803 }
804
805 lang = fz_xml_att(node, "lang");
806 if (lang)
807 child_lang = fz_text_language_from_string(lang);
808
809 if (display == DIS_INLINE)
810 box = new_short_box(ctx, g->pool, child_dir);
811 else
812 box = new_box(ctx, g->pool, child_dir);
813 box->style = fz_css_enlist(ctx, &style, &g->styles, g->pool);
814
815 id = fz_xml_att(node, "id");
816 if (id)
817 box->id = fz_pool_strdup(ctx, g->pool, id);
818
819 if (display == DIS_BLOCK || display == DIS_INLINE_BLOCK)
820 {
821 top = insert_block_box(ctx, box, top);
822 if (g->is_fb2)
823 {
824 if (!strcmp(tag, "title") || !strcmp(tag, "subtitle"))
825 box->heading = fz_mini(section_depth, 6);
826 }
827 else
828 {
829 if (tag[0]=='h' && tag[1]>='1' && tag[1]<='6' && tag[2]==0)
830 box->heading = tag[1] - '0';
831 }
832 }
833 else if (display == DIS_LIST_ITEM)
834 {
835 top = insert_block_box(ctx, box, top);
836 box->list_item = ++list_counter;
837 }
838 else if (display == DIS_INLINE)
839 {
840 insert_inline_box(ctx, box, top, child_dir, g);
841 if (id)
842 generate_anchor(ctx, box, g);
843 if (tag[0]=='a' && tag[1]==0)
844 {
845 if (g->is_fb2)
846 {
847 href = fz_xml_att(node, "l:href");
848 if (!href)
849 href = fz_xml_att(node, "xlink:href");
850 }
851 else
852 href = fz_xml_att(node, g->is_fb2 ? "l:href" : "href");
853 if (href)
854 box->href = fz_pool_strdup(ctx, g->pool, href);
855 }
856 }
857 else if (display == DIS_TABLE)
858 {
859 top = insert_table_box(ctx, box, top);
860 }
861 else if (display == DIS_TABLE_ROW)
862 {
863 top = insert_table_row_box(ctx, box, top);
864 }
865 else if (display == DIS_TABLE_CELL)
866 {
867 top = insert_table_cell_box(ctx, box, top);
868 }
869 else
870 {
871 fz_warn(ctx, "unknown box display type");
872 insert_box(ctx, box, BOX_BLOCK, top);
873 }
874
875 if (fz_xml_down(node))
876 {
877 int child_counter = list_counter;
878 int child_section = section_depth;
879 if (!strcmp(tag, "ul") || !strcmp(tag, "ol"))
880 child_counter = 0;
881 else if (!strcmp(tag, "section"))
882 ++child_section;
883 last_top = generate_boxes(ctx,
884 fz_xml_down(node),
885 box,
886 &match,
887 child_counter,
888 child_section,
889 child_dir,
890 child_lang,
891 g);
892 if (last_top != box)
893 top = last_top;
894 }
895 }
896 }
897 else
898 {
899 const char *text = fz_xml_text(node);
900 int collapse = top->style->white_space & WS_COLLAPSE;
901 if (collapse && is_all_white(text))
902 {
903 g->emit_white = 1;
904 }
905 else
906 {
907 if (top->type != BOX_INLINE)
908 {
909 /* Create anonymous inline box, with the same style as the top block box. */
910 fz_css_style style;
911 box = new_short_box(ctx, g->pool, markup_dir);
912 fz_default_css_style(ctx, &style);
913 box->style = fz_css_enlist(ctx, &style, &g->styles, g->pool);
914 insert_inline_box(ctx, box, top, markup_dir, g);
915 style = *top->style;
916 /* Make sure not to recursively multiply font sizes. */
917 style.font_size.value = 1;
918 style.font_size.unit = N_SCALE;
919 box->style = fz_css_enlist(ctx, &style, &g->styles, g->pool);
920 generate_text(ctx, box, text, markup_lang, g);
921 }
922 else
923 {
924 generate_text(ctx, top, text, markup_lang, g);
925 }
926 }
927 }
928
929 node = fz_xml_next(node);
930 }
931
932 return top;
933 }
934
/*
 * Concatenate the text of all direct children of 'root' into one newly
 * allocated, NUL-terminated string. Children without text contribute
 * nothing. The caller owns the result and releases it with fz_free.
 */
static char *concat_text(fz_context *ctx, fz_xml *root)
{
	fz_xml *node;
	size_t total = 1; /* room for the terminator */
	size_t pos = 0;
	char *s;

	/* First pass: measure. */
	for (node = fz_xml_down(root); node; node = fz_xml_next(node))
	{
		const char *text = fz_xml_text(node);
		if (text)
			total += strlen(text);
	}

	s = Memento_label(fz_malloc(ctx, total), "concat_html");

	/* Second pass: copy. */
	for (node = fz_xml_down(root); node; node = fz_xml_next(node))
	{
		const char *text = fz_xml_text(node);
		size_t len;

		if (!text)
			continue;

		len = strlen(text);
		memcpy(s + pos, text, len);
		pos += len;
	}
	s[pos] = 0;

	return s;
}
959
960 static void
html_load_css_link(fz_context * ctx,fz_html_font_set * set,fz_archive * zip,const char * base_uri,fz_css * css,fz_xml * root,const char * href)961 html_load_css_link(fz_context *ctx, fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_css *css, fz_xml *root, const char *href)
962 {
963 char path[2048];
964 char css_base_uri[2048];
965 fz_buffer *buf;
966
967 fz_var(buf);
968
969 fz_strlcpy(path, base_uri, sizeof path);
970 fz_strlcat(path, "/", sizeof path);
971 fz_strlcat(path, href, sizeof path);
972 fz_urldecode(path);
973 fz_cleanname(path);
974
975 fz_dirname(css_base_uri, path, sizeof css_base_uri);
976
977 buf = NULL;
978 fz_try(ctx)
979 {
980 buf = fz_read_archive_entry(ctx, zip, path);
981 fz_parse_css(ctx, css, fz_string_from_buffer(ctx, buf), path);
982 fz_add_css_font_faces(ctx, set, zip, css_base_uri, css);
983 }
984 fz_always(ctx)
985 fz_drop_buffer(ctx, buf);
986 fz_catch(ctx)
987 fz_warn(ctx, "ignoring stylesheet %s", path);
988 }
989
990 static void
html_load_css(fz_context * ctx,fz_html_font_set * set,fz_archive * zip,const char * base_uri,fz_css * css,fz_xml * root)991 html_load_css(fz_context *ctx, fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_css *css, fz_xml *root)
992 {
993 fz_xml *html, *head, *node;
994
995 html = fz_xml_find(root, "html");
996 head = fz_xml_find_down(html, "head");
997 for (node = fz_xml_down(head); node; node = fz_xml_next(node))
998 {
999 if (fz_xml_is_tag(node, "link"))
1000 {
1001 char *rel = fz_xml_att(node, "rel");
1002 if (rel && !fz_strcasecmp(rel, "stylesheet"))
1003 {
1004 char *type = fz_xml_att(node, "type");
1005 if ((type && !strcmp(type, "text/css")) || !type)
1006 {
1007 char *href = fz_xml_att(node, "href");
1008 if (href)
1009 {
1010 html_load_css_link(ctx, set, zip, base_uri, css, root, href);
1011 }
1012 }
1013 }
1014 }
1015 else if (fz_xml_is_tag(node, "style"))
1016 {
1017 char *s = concat_text(ctx, node);
1018 fz_try(ctx)
1019 {
1020 fz_parse_css(ctx, css, s, "<style>");
1021 fz_add_css_font_faces(ctx, set, zip, base_uri, css);
1022 }
1023 fz_catch(ctx)
1024 fz_warn(ctx, "ignoring inline stylesheet");
1025 fz_free(ctx, s);
1026 }
1027 }
1028 }
1029
1030 static void
fb2_load_css(fz_context * ctx,fz_html_font_set * set,fz_archive * zip,const char * base_uri,fz_css * css,fz_xml * root)1031 fb2_load_css(fz_context *ctx, fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_css *css, fz_xml *root)
1032 {
1033 fz_xml *fictionbook, *stylesheet;
1034
1035 fictionbook = fz_xml_find(root, "FictionBook");
1036 stylesheet = fz_xml_find_down(fictionbook, "stylesheet");
1037 if (stylesheet)
1038 {
1039 char *s = concat_text(ctx, stylesheet);
1040 fz_try(ctx)
1041 {
1042 fz_parse_css(ctx, css, s, "<stylesheet>");
1043 fz_add_css_font_faces(ctx, set, zip, base_uri, css);
1044 }
1045 fz_catch(ctx)
1046 fz_warn(ctx, "ignoring inline stylesheet");
1047 fz_free(ctx, s);
1048 }
1049 }
1050
1051 static fz_tree *
load_fb2_images(fz_context * ctx,fz_xml * root)1052 load_fb2_images(fz_context *ctx, fz_xml *root)
1053 {
1054 fz_xml *fictionbook, *binary;
1055 fz_tree *images = NULL;
1056
1057 fictionbook = fz_xml_find(root, "FictionBook");
1058 for (binary = fz_xml_find_down(fictionbook, "binary"); binary; binary = fz_xml_find_next(binary, "binary"))
1059 {
1060 const char *id = fz_xml_att(binary, "id");
1061 char *b64 = NULL;
1062 fz_buffer *buf = NULL;
1063 fz_image *img = NULL;
1064
1065 fz_var(b64);
1066 fz_var(buf);
1067
1068 fz_try(ctx)
1069 {
1070 b64 = concat_text(ctx, binary);
1071 buf = fz_new_buffer_from_base64(ctx, b64, strlen(b64));
1072 img = fz_new_image_from_buffer(ctx, buf);
1073 }
1074 fz_always(ctx)
1075 {
1076 fz_drop_buffer(ctx, buf);
1077 fz_free(ctx, b64);
1078 }
1079 fz_catch(ctx)
1080 fz_rethrow(ctx);
1081
1082 images = fz_tree_insert(ctx, images, id, img);
1083 }
1084
1085 return images;
1086 }
1087
/* Growable scratch array of 32-bit code points used for bidi analysis. */
typedef struct
{
	uint32_t *data; /* storage, (re)allocated with fz_realloc_array */
	size_t cap; /* number of elements data has room for */
	size_t len; /* number of elements currently in use */
} uni_buf;
1094
/* State handed to fragment_cb through its opaque argument. */
typedef struct
{
	fz_context *ctx; /* context, needed when a flow node has to be split */
	fz_pool *pool; /* pool passed to split_flow */
	fz_html_flow *flow; /* next flow node to be labelled; advanced by the callback */
	uni_buf *buffer; /* the code point buffer the fragments refer to */
} bidi_data;
1102
fragment_cb(const uint32_t * fragment,size_t fragment_len,int bidi_level,int script,void * arg)1103 static void fragment_cb(const uint32_t *fragment,
1104 size_t fragment_len,
1105 int bidi_level,
1106 int script,
1107 void *arg)
1108 {
1109 bidi_data *data = (bidi_data *)arg;
1110 size_t fragment_offset = fragment - data->buffer->data;
1111
1112 /* We are guaranteed that fragmentOffset will be at the beginning
1113 * of flow. */
1114 while (fragment_len > 0)
1115 {
1116 size_t len;
1117
1118 if (data->flow->type == FLOW_SPACE)
1119 {
1120 len = 1;
1121 }
1122 else if (data->flow->type == FLOW_BREAK || data->flow->type == FLOW_SBREAK ||
1123 data->flow->type == FLOW_SHYPHEN || data->flow->type == FLOW_ANCHOR)
1124 {
1125 len = 0;
1126 }
1127 else
1128 {
1129 /* Must be text */
1130 len = fz_utflen(data->flow->content.text);
1131 if (len > fragment_len)
1132 {
1133 /* We need to split this flow box */
1134 (void)split_flow(data->ctx, data->pool, data->flow, fragment_len);
1135 len = fz_utflen(data->flow->content.text);
1136 }
1137 }
1138
1139 /* This flow box is entirely contained within this fragment. */
1140 data->flow->bidi_level = bidi_level;
1141 data->flow->script = script;
1142 data->flow = data->flow->next;
1143 fragment_offset += len;
1144 fragment_len -= len;
1145 }
1146 }
1147
1148 static fz_bidi_direction
detect_flow_directionality(fz_context * ctx,fz_pool * pool,uni_buf * buffer,fz_bidi_direction bidi_dir,fz_html_flow * flow)1149 detect_flow_directionality(fz_context *ctx, fz_pool *pool, uni_buf *buffer, fz_bidi_direction bidi_dir, fz_html_flow *flow)
1150 {
1151 fz_html_flow *end = flow;
1152 bidi_data data;
1153
1154 while (end)
1155 {
1156 int level = end->bidi_level;
1157
1158 /* Gather the text from the flow up into a single buffer (at
1159 * least, as much of it as has the same direction markup). */
1160 buffer->len = 0;
1161 while (end && (level & 1) == (end->bidi_level & 1))
1162 {
1163 size_t len = 0;
1164 const char *text = "";
1165 int broken = 0;
1166
1167 switch (end->type)
1168 {
1169 case FLOW_WORD:
1170 len = fz_utflen(end->content.text);
1171 text = end->content.text;
1172 break;
1173 case FLOW_SPACE:
1174 len = 1;
1175 text = " ";
1176 break;
1177 case FLOW_SHYPHEN:
1178 case FLOW_SBREAK:
1179 break;
1180 case FLOW_BREAK:
1181 case FLOW_IMAGE:
1182 broken = 1;
1183 break;
1184 }
1185
1186 end = end->next;
1187
1188 if (broken)
1189 break;
1190
1191 /* Make sure the buffer is large enough */
1192 if (buffer->len + len > buffer->cap)
1193 {
1194 size_t newcap = buffer->cap;
1195 if (newcap < 128)
1196 newcap = 128; /* Sensible small default */
1197
1198 while (newcap < buffer->len + len)
1199 newcap = (newcap * 3) / 2;
1200
1201 buffer->data = fz_realloc_array(ctx, buffer->data, newcap, uint32_t);
1202 buffer->cap = newcap;
1203 }
1204
1205 /* Expand the utf8 text into Unicode and store it in the buffer */
1206 while (*text)
1207 {
1208 int rune;
1209 text += fz_chartorune(&rune, text);
1210 buffer->data[buffer->len++] = rune;
1211 }
1212 }
1213
1214 /* Detect directionality for the buffer */
1215 data.ctx = ctx;
1216 data.pool = pool;
1217 data.flow = flow;
1218 data.buffer = buffer;
1219 fz_bidi_fragment_text(ctx, buffer->data, buffer->len, &bidi_dir, fragment_cb, &data, 0 /* Flags */);
1220 flow = end;
1221 }
1222 return bidi_dir;
1223 }
1224
1225 static void
detect_box_directionality(fz_context * ctx,fz_pool * pool,uni_buf * buffer,fz_html_box * box)1226 detect_box_directionality(fz_context *ctx, fz_pool *pool, uni_buf *buffer, fz_html_box *box)
1227 {
1228 while (box)
1229 {
1230 if (box->flow_head)
1231 box->markup_dir = detect_flow_directionality(ctx, pool, buffer, box->markup_dir, box->flow_head);
1232 detect_box_directionality(ctx, pool, buffer, box->down);
1233 box = box->next;
1234 }
1235 }
1236
1237 static void
detect_directionality(fz_context * ctx,fz_pool * pool,fz_html_box * box)1238 detect_directionality(fz_context *ctx, fz_pool *pool, fz_html_box *box)
1239 {
1240 uni_buf buffer = { NULL };
1241
1242 fz_try(ctx)
1243 detect_box_directionality(ctx, pool, &buffer, box);
1244 fz_always(ctx)
1245 fz_free(ctx, buffer.data);
1246 fz_catch(ctx)
1247 fz_rethrow(ctx);
1248 }
1249
1250 /* Here we look for places where box->next actually means
1251 * 'the last of my children', and correct it by setting
1252 * next == NULL. We can spot these because box->next->up == box. */
1253 static void
fix_nexts(fz_html_box * box)1254 fix_nexts(fz_html_box *box)
1255 {
1256 while (box)
1257 {
1258 if (box->down)
1259 fix_nexts(box->down);
1260 if (box->next && box->next->up == box)
1261 {
1262 box->next = NULL;
1263 break;
1264 }
1265 box = box->next;
1266 }
1267 }
1268
1269 static fz_html *
fz_parse_html_imp(fz_context * ctx,fz_html_font_set * set,fz_archive * zip,const char * base_uri,fz_buffer * buf,const char * user_css,int try_xml,int try_html5)1270 fz_parse_html_imp(fz_context *ctx,
1271 fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_buffer *buf, const char *user_css,
1272 int try_xml, int try_html5)
1273 {
1274 fz_xml_doc *xml;
1275 fz_xml *root, *node;
1276 fz_html *html = NULL;
1277 char *title;
1278
1279 fz_css_match match;
1280 struct genstate g;
1281
1282 g.pool = NULL;
1283 g.set = set;
1284 g.zip = zip;
1285 g.images = NULL;
1286 g.base_uri = base_uri;
1287 g.css = NULL;
1288 g.at_bol = 0;
1289 g.emit_white = 0;
1290 g.last_brk_cls = UCDN_LINEBREAK_CLASS_OP;
1291 g.styles = NULL;
1292
1293 if (try_xml && try_html5)
1294 {
1295 fz_try(ctx)
1296 xml = fz_parse_xml(ctx, buf, 1);
1297 fz_catch(ctx)
1298 {
1299 if (fz_caught(ctx) == FZ_ERROR_SYNTAX)
1300 {
1301 fz_warn(ctx, "syntax error in XHTML; retrying using HTML5 parser");
1302 xml = fz_parse_xml_from_html5(ctx, buf);
1303 }
1304 else
1305 fz_rethrow(ctx);
1306 }
1307 }
1308 else if (try_xml)
1309 xml = fz_parse_xml(ctx, buf, 1);
1310 else if (try_html5)
1311 xml = fz_parse_xml_from_html5(ctx, buf);
1312 else
1313 return NULL; /* should never happen! */
1314
1315 root = fz_xml_root(xml);
1316
1317 fz_try(ctx)
1318 g.css = fz_new_css(ctx);
1319 fz_catch(ctx)
1320 {
1321 fz_drop_xml(ctx, xml);
1322 fz_rethrow(ctx);
1323 }
1324
1325 #ifndef NDEBUG
1326 if (fz_atoi(getenv("FZ_DEBUG_XML")))
1327 fz_debug_xml(root, 0);
1328 #endif
1329
1330 fz_try(ctx)
1331 {
1332 if (fz_xml_find(root, "FictionBook"))
1333 {
1334 g.is_fb2 = 1;
1335 fz_parse_css(ctx, g.css, fb2_default_css, "<default:fb2>");
1336 if (fz_use_document_css(ctx))
1337 fb2_load_css(ctx, g.set, g.zip, g.base_uri, g.css, root);
1338 g.images = load_fb2_images(ctx, root);
1339 }
1340 else
1341 {
1342 g.is_fb2 = 0;
1343 fz_parse_css(ctx, g.css, html_default_css, "<default:html>");
1344 if (fz_use_document_css(ctx))
1345 html_load_css(ctx, g.set, g.zip, g.base_uri, g.css, root);
1346 g.images = NULL;
1347 }
1348
1349 if (user_css)
1350 {
1351 fz_parse_css(ctx, g.css, user_css, "<user>");
1352 fz_add_css_font_faces(ctx, g.set, g.zip, ".", g.css);
1353 }
1354 }
1355 fz_catch(ctx)
1356 {
1357 fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
1358 fz_warn(ctx, "ignoring styles due to errors: %s", fz_caught_message(ctx));
1359 }
1360
1361 #ifndef NDEBUG
1362 if (fz_atoi(getenv("FZ_DEBUG_CSS")))
1363 fz_debug_css(ctx, g.css);
1364 #endif
1365
1366 fz_try(ctx)
1367 {
1368 fz_css_style style;
1369
1370 g.pool = fz_new_pool(ctx);
1371 html = fz_pool_alloc(ctx, g.pool, sizeof *html);
1372 FZ_INIT_STORABLE(html, 1, fz_drop_html_imp);
1373 html->pool = g.pool;
1374 html->root = new_box(ctx, g.pool, DEFAULT_DIR);
1375 html->layout_w = 0;
1376 html->layout_h = 0;
1377 html->layout_em = 0;
1378
1379 fz_match_css_at_page(ctx, &match, g.css);
1380 fz_apply_css_style(ctx, g.set, &style, &match);
1381 html->root->style = fz_css_enlist(ctx, &style, &g.styles, g.pool);
1382 // TODO: transfer page margins out of this hacky box
1383
1384 generate_boxes(ctx, root, html->root, &match, 0, 0, DEFAULT_DIR, FZ_LANG_UNSET, &g);
1385 fix_nexts(html->root);
1386
1387 detect_directionality(ctx, g.pool, html->root);
1388
1389 if (g.is_fb2)
1390 {
1391 node = fz_xml_find(root, "FictionBook");
1392 node = fz_xml_find_down(node, "description");
1393 node = fz_xml_find_down(node, "title-info");
1394 node = fz_xml_find_down(node, "book-title");
1395 title = fz_xml_text(fz_xml_down(node));
1396 if (title)
1397 html->title = fz_pool_strdup(ctx, g.pool, title);
1398 }
1399 else
1400 {
1401 node = fz_xml_find(root, "html");
1402 node = fz_xml_find_down(node, "head");
1403 node = fz_xml_find_down(node, "title");
1404 title = fz_xml_text(fz_xml_down(node));
1405 if (title)
1406 html->title = fz_pool_strdup(ctx, g.pool, title);
1407 }
1408 }
1409 fz_always(ctx)
1410 {
1411 fz_drop_tree(ctx, g.images, (void(*)(fz_context*,void*))fz_drop_image);
1412 fz_drop_css(ctx, g.css);
1413 fz_drop_xml(ctx, xml);
1414 }
1415 fz_catch(ctx)
1416 {
1417 fz_drop_html(ctx, html);
1418 fz_rethrow(ctx);
1419 }
1420
1421 return html;
1422 }
1423
1424 fz_html *
fz_parse_fb2(fz_context * ctx,fz_html_font_set * set,fz_archive * zip,const char * base_uri,fz_buffer * buf,const char * user_css)1425 fz_parse_fb2(fz_context *ctx, fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_buffer *buf, const char *user_css)
1426 {
1427 /* parse only as XML */
1428 return fz_parse_html_imp(ctx, set, zip, base_uri, buf, user_css, 1, 0);
1429 }
1430
1431 fz_html *
fz_parse_html5(fz_context * ctx,fz_html_font_set * set,fz_archive * zip,const char * base_uri,fz_buffer * buf,const char * user_css)1432 fz_parse_html5(fz_context *ctx, fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_buffer *buf, const char *user_css)
1433 {
1434 /* parse only as HTML5 */
1435 return fz_parse_html_imp(ctx, set, zip, base_uri, buf, user_css, 0, 1);
1436 }
1437
1438 fz_html *
fz_parse_xhtml(fz_context * ctx,fz_html_font_set * set,fz_archive * zip,const char * base_uri,fz_buffer * buf,const char * user_css)1439 fz_parse_xhtml(fz_context *ctx, fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_buffer *buf, const char *user_css)
1440 {
1441 /* try as XML first, fall back to HTML5 */
1442 return fz_parse_html_imp(ctx, set, zip, base_uri, buf, user_css, 1, 1);
1443 }
1444
/* Write `level` tab characters to stdout (nothing when level <= 0). */
static void indent(int level)
{
	int i;

	for (i = 0; i < level; i++)
		putchar('\t');
}
1450
1451 static void
fz_debug_html_flow(fz_context * ctx,fz_html_flow * flow,int level)1452 fz_debug_html_flow(fz_context *ctx, fz_html_flow *flow, int level)
1453 {
1454 fz_html_box *sbox = NULL;
1455 while (flow)
1456 {
1457 if (flow->box != sbox) {
1458 if (sbox) {
1459 indent(level);
1460 printf("}\n");
1461 }
1462 sbox = flow->box;
1463 indent(level);
1464 printf("span em=%g font='%s'", sbox->em, fz_font_name(ctx, sbox->style->font));
1465 if (fz_font_is_serif(ctx, sbox->style->font))
1466 printf(" serif");
1467 else
1468 printf(" sans");
1469 if (fz_font_is_monospaced(ctx, sbox->style->font))
1470 printf(" monospaced");
1471 if (fz_font_is_bold(ctx, sbox->style->font))
1472 printf(" bold");
1473 if (fz_font_is_italic(ctx, sbox->style->font))
1474 printf(" italic");
1475 if (sbox->style->small_caps)
1476 printf(" small-caps");
1477 printf("\n");
1478 indent(level);
1479 printf("{\n");
1480 }
1481
1482 indent(level+1);
1483 switch (flow->type) {
1484 case FLOW_WORD: printf("word "); break;
1485 case FLOW_SPACE: printf("space"); break;
1486 case FLOW_SBREAK: printf("sbrk "); break;
1487 case FLOW_SHYPHEN: printf("shy "); break;
1488 case FLOW_BREAK: printf("break"); break;
1489 case FLOW_IMAGE: printf("image"); break;
1490 case FLOW_ANCHOR: printf("anchor"); break;
1491 }
1492 printf(" y=%g x=%g w=%g", flow->y, flow->x, flow->w);
1493 if (flow->type == FLOW_IMAGE)
1494 printf(" h=%g", flow->h);
1495 if (flow->type == FLOW_WORD)
1496 printf(" text='%s'", flow->content.text);
1497 printf("\n");
1498 if (flow->breaks_line) {
1499 indent(level+1);
1500 printf("*\n");
1501 }
1502
1503 flow = flow->next;
1504 }
1505 indent(level);
1506 printf("}\n");
1507 }
1508
1509 static void
fz_debug_html_box(fz_context * ctx,fz_html_box * box,int level)1510 fz_debug_html_box(fz_context *ctx, fz_html_box *box, int level)
1511 {
1512 while (box)
1513 {
1514 indent(level);
1515 switch (box->type) {
1516 case BOX_BLOCK: printf("block"); break;
1517 case BOX_FLOW: printf("flow"); break;
1518 case BOX_INLINE: printf("inline"); break;
1519 case BOX_TABLE: printf("table"); break;
1520 case BOX_TABLE_ROW: printf("table-row"); break;
1521 case BOX_TABLE_CELL: printf("table-cell"); break;
1522 }
1523
1524 printf(" em=%g x=%g y=%g w=%g b=%g\n", box->em, box->x, box->y, box->w, box->b);
1525
1526 indent(level);
1527 printf("{\n");
1528 if (box->type == BOX_BLOCK) {
1529 indent(level+1);
1530 printf("margin=%g %g %g %g\n", box->margin[0], box->margin[1], box->margin[2], box->margin[3]);
1531 }
1532 if (box->is_first_flow) {
1533 indent(level+1);
1534 printf("is-first-flow\n");
1535 }
1536 if (box->list_item) {
1537 indent(level+1);
1538 printf("list=%d\n", box->list_item);
1539 }
1540 if (box->id) {
1541 indent(level+1);
1542 printf("id=%s\n", box->id);
1543 }
1544 if (box->href) {
1545 indent(level+1);
1546 printf("href=%s\n", box->href);
1547 }
1548
1549 if (box->down)
1550 fz_debug_html_box(ctx, box->down, level + 1);
1551 if (box->flow_head)
1552 fz_debug_html_flow(ctx, box->flow_head, level + 1);
1553
1554 indent(level);
1555 printf("}\n");
1556
1557 box = box->next;
1558 }
1559 }
1560
1561 void
fz_debug_html(fz_context * ctx,fz_html_box * box)1562 fz_debug_html(fz_context *ctx, fz_html_box *box)
1563 {
1564 fz_debug_html_box(ctx, box, 0);
1565 }
1566
1567 static size_t
fz_html_size(fz_context * ctx,fz_html * html)1568 fz_html_size(fz_context *ctx, fz_html *html)
1569 {
1570 return html ? fz_pool_size(ctx, html->pool) : 0;
1571 }
1572
/* Magic to make html storable. */
/* Store key identifying the parsed html of one chapter of one document. */
typedef struct {
	int refs; /* reference count, managed through fz_keep_imp/fz_drop_imp */
	void *doc; /* owning document; compared by pointer identity */
	int chapter_num; /* chapter within that document */
} fz_html_key;
1579
1580 static int
fz_make_hash_html_key(fz_context * ctx,fz_store_hash * hash,void * key_)1581 fz_make_hash_html_key(fz_context *ctx, fz_store_hash *hash, void *key_)
1582 {
1583 fz_html_key *key = (fz_html_key *)key_;
1584 hash->u.pi.ptr = key->doc;
1585 hash->u.pi.i = key->chapter_num;
1586 return 1;
1587 }
1588
1589 static void *
fz_keep_html_key(fz_context * ctx,void * key_)1590 fz_keep_html_key(fz_context *ctx, void *key_)
1591 {
1592 fz_html_key *key = (fz_html_key *)key_;
1593 return fz_keep_imp(ctx, key, &key->refs);
1594 }
1595
1596 static void
fz_drop_html_key(fz_context * ctx,void * key_)1597 fz_drop_html_key(fz_context *ctx, void *key_)
1598 {
1599 fz_html_key *key = (fz_html_key *)key_;
1600 if (fz_drop_imp(ctx, key, &key->refs))
1601 {
1602 fz_free(ctx, key);
1603 }
1604 }
1605
1606 static int
fz_cmp_html_key(fz_context * ctx,void * k0_,void * k1_)1607 fz_cmp_html_key(fz_context *ctx, void *k0_, void *k1_)
1608 {
1609 fz_html_key *k0 = (fz_html_key *)k0_;
1610 fz_html_key *k1 = (fz_html_key *)k1_;
1611 return k0->doc == k1->doc && k0->chapter_num == k1->chapter_num;
1612 }
1613
1614 static void
fz_format_html_key(fz_context * ctx,char * s,size_t n,void * key_)1615 fz_format_html_key(fz_context *ctx, char *s, size_t n, void *key_)
1616 {
1617 fz_html_key *key = (fz_html_key *)key_;
1618 fz_snprintf(s, n, "(html doc=%p, ch=%d)", key->doc, key->chapter_num);
1619 }
1620
/*
 * Store type descriptor for parsed html: a name plus the key callbacks
 * defined above, given positionally.
 */
static const fz_store_type fz_html_store_type =
{
	"fz_html",
	fz_make_hash_html_key,
	fz_keep_html_key,
	fz_drop_html_key,
	fz_cmp_html_key,
	fz_format_html_key,
	NULL /* last slot left unset; NOTE(review): confirm its meaning against fz_store_type */
};
1631
/*
 * Put html into the store under (doc, chapter).
 *
 * If fz_store_item hands back a different item for that key, the caller's
 * html is dropped and that item is returned instead; otherwise html itself
 * is returned. Storing is best effort: any error is swallowed and html is
 * returned as it stands.
 */
fz_html *fz_store_html(fz_context *ctx, fz_html *html, void *doc, int chapter)
{
	fz_html_key *key = NULL;
	fz_html *existing;

	fz_var(key);

	fz_try(ctx)
	{
		key = fz_malloc_struct(ctx, fz_html_key);
		key->refs = 1;
		key->doc = doc;
		key->chapter_num = chapter;

		existing = fz_store_item(ctx, key, html, fz_html_size(ctx, html), &fz_html_store_type);
		if (existing)
		{
			/* Switch over to the item the store returned. */
			fz_drop_html(ctx, html);
			html = existing;
		}
	}
	fz_always(ctx)
	{
		/* Release the reference to the key taken above. */
		fz_drop_html_key(ctx, key);
	}
	fz_catch(ctx)
	{
		/* Do nothing */
	}

	return html;
}
1662
fz_find_html(fz_context * ctx,void * doc,int chapter)1663 fz_html *fz_find_html(fz_context *ctx, void *doc, int chapter)
1664 {
1665 fz_html_key key;
1666
1667 key.refs = 1;
1668 key.doc = doc;
1669 key.chapter_num = chapter;
1670 return fz_find_item(ctx, &fz_drop_html_imp, &key, &fz_html_store_type);
1671 }
1672
1673 static int
html_filter_store(fz_context * ctx,void * doc,void * key_)1674 html_filter_store(fz_context *ctx, void *doc, void *key_)
1675 {
1676 fz_html_key *key = (fz_html_key *)key_;
1677
1678 return (doc == key->doc);
1679 }
1680
/* Run the store filter over html entries, selecting those keyed by doc. */
void fz_purge_stored_html(fz_context *ctx, void *doc)
{
	const fz_store_type *type = &fz_html_store_type;

	fz_filter_store(ctx, html_filter_store, doc, type);
}
1685