1 #include "mupdf/fitz.h"
2 #include "mupdf/ucdn.h"
3 #include "html-imp.h"
4 
5 #include <string.h>
6 #include <stdio.h>
7 #include <assert.h>
8 
/* Four small index constants (0..3). Presumably Top, Right, Bottom, Left
 * indices for per-side box arrays -- TODO confirm against html-imp.h. */
enum { T, R, B, L };
10 
/* Direction used when an element's "dir" attribute has an unrecognised value. */
#define DEFAULT_DIR FZ_BIDI_LTR
12 
/*
 * Built-in style sheet text for HTML documents: default display types,
 * margins and font styling for common elements, written as one compact
 * CSS string. (Its consumer lies outside this part of the file.)
 */
static const char *html_default_css =
"@page{margin:3em 2em}"
"a{color:#06C;text-decoration:underline}"
"address{display:block;font-style:italic}"
"b{font-weight:bold}"
"bdo{direction:rtl;unicode-bidi:bidi-override}"
"blockquote{display:block;margin:1em 40px}"
"body{display:block;margin:1em}"
"cite{font-style:italic}"
"code{font-family:monospace}"
"dd{display:block;margin:0 0 0 40px}"
"del{text-decoration:line-through}"
"div{display:block}"
"dl{display:block;margin:1em 0}"
"dt{display:block}"
"em{font-style:italic}"
"h1{display:block;font-size:2em;font-weight:bold;margin:0.67em 0;page-break-after:avoid}"
"h2{display:block;font-size:1.5em;font-weight:bold;margin:0.83em 0;page-break-after:avoid}"
"h3{display:block;font-size:1.17em;font-weight:bold;margin:1em 0;page-break-after:avoid}"
"h4{display:block;font-size:1em;font-weight:bold;margin:1.33em 0;page-break-after:avoid}"
"h5{display:block;font-size:0.83em;font-weight:bold;margin:1.67em 0;page-break-after:avoid}"
"h6{display:block;font-size:0.67em;font-weight:bold;margin:2.33em 0;page-break-after:avoid}"
"head{display:none}"
"hr{border-style:solid;border-width:1px;display:block;margin-bottom:0.5em;margin-top:0.5em;text-align:center}"
"html{display:block}"
"i{font-style:italic}"
"ins{text-decoration:underline}"
"kbd{font-family:monospace}"
"li{display:list-item}"
"menu{display:block;list-style-type:disc;margin:1em 0;padding:0 0 0 30pt}"
"ol{display:block;list-style-type:decimal;margin:1em 0;padding:0 0 0 30pt}"
"p{display:block;margin:1em 0}"
"pre{display:block;font-family:monospace;margin:1em 0;white-space:pre}"
"samp{font-family:monospace}"
"script{display:none}"
"small{font-size:0.83em}"
"strong{font-weight:bold}"
"style{display:none}"
"sub{font-size:0.83em;vertical-align:sub}"
"sup{font-size:0.83em;vertical-align:super}"
"table{display:table}"
"tbody{display:table-row-group}"
"td{display:table-cell;padding:1px}"
"tfoot{display:table-footer-group}"
"th{display:table-cell;font-weight:bold;padding:1px;text-align:center}"
"thead{display:table-header-group}"
"tr{display:table-row}"
"ul{display:block;list-style-type:disc;margin:1em 0;padding:0 0 0 30pt}"
"ul ul{list-style-type:circle}"
"ul ul ul{list-style-type:square}"
"var{font-style:italic}"
"svg{display:none}"
;
66 
/*
 * Built-in style sheet text for FictionBook (FB2) documents: display
 * types and basic typography for the FB2 element names, written as one
 * compact CSS string. (Its consumer lies outside this part of the file.)
 */
static const char *fb2_default_css =
"@page{margin:3em 2em}"
"FictionBook{display:block;margin:1em}"
"stylesheet,binary{display:none}"
"description>*{display:none}"
"description>title-info{display:block}"
"description>title-info>*{display:none}"
"description>title-info>coverpage{display:block;page-break-before:always;page-break-after:always}"
"body,section,title,subtitle,p,cite,epigraph,text-author,date,poem,stanza,v,empty-line{display:block}"
"image{display:block}"
"p>image{display:inline}"
"table{display:table}"
"tr{display:table-row}"
"th,td{display:table-cell}"
"a{color:#06C;text-decoration:underline}"
"a[type=note]{font-size:small;vertical-align:super}"
"code{white-space:pre;font-family:monospace}"
"emphasis{font-style:italic}"
"strikethrough{text-decoration:line-through}"
"strong{font-weight:bold}"
"sub{font-size:small;vertical-align:sub}"
"sup{font-size:small;vertical-align:super}"
"image{margin:1em 0;text-align:center}"
"cite,poem{margin:1em 2em}"
"subtitle,epigraph,stanza{margin:1em 0}"
"title>p{text-align:center;font-size:x-large}"
"subtitle{text-align:center;font-size:large}"
"p{margin-top:1em;text-align:justify}"
"empty-line{padding-top:1em}"
"p+p{margin-top:0;text-indent:1.5em}"
"empty-line+p{margin-top:0}"
"section>title{page-break-before:always}"
;
100 
/* State shared by the box/flow generation functions while walking the XML tree. */
struct genstate
{
	fz_pool *pool; /* pool that boxes, flow nodes and copied strings are allocated from */
	fz_html_font_set *set; /* font set handed to fz_apply_css_style */
	fz_archive *zip; /* archive that image files are read from */
	fz_tree *images; /* tree searched for FB2 "#name" image references */
	int is_fb2; /* non-zero selects the FictionBook-specific tag handling */
	const char *base_uri; /* prefix joined with relative image paths */
	fz_css *css; /* style rules matched against each element */
	int at_bol; /* set after a forced break or a new flow box; cleared once a word or image is emitted */
	int emit_white; /* collapsed whitespace has been seen and is waiting to be emitted by flush_space */
	int last_brk_cls; /* line break class of the previous character; row index into pairbrk */
	fz_css_style_splay *styles; /* style collection passed to fz_css_enlist */
};
115 
/* Return 1 if c is a space, tab, carriage return or line feed; 0 otherwise. */
static int iswhite(int c)
{
	switch (c)
	{
	case ' ':
	case '\t':
	case '\r':
	case '\n':
		return 1;
	default:
		return 0;
	}
}
120 
/* Return 1 if every character of s is whitespace (per iswhite), which
 * includes the empty string; return 0 at the first other character. */
static int is_all_white(const char *s)
{
	const char *p;

	for (p = s; *p != 0; ++p)
	{
		if (!iswhite(*p))
			return 0;
	}
	return 1;
}
131 
132 /* TODO: pool allocator for flow nodes */
133 /* TODO: store text by pointing to a giant buffer */
134 
/* Walk a flow list and drop the image reference held by each FLOW_IMAGE
 * node. The nodes themselves are not released here. */
static void fz_drop_html_flow(fz_context *ctx, fz_html_flow *flow)
{
	fz_html_flow *node, *following;

	for (node = flow; node != NULL; node = following)
	{
		/* Read the link before touching the node's content. */
		following = node->next;
		if (node->type == FLOW_IMAGE)
			fz_drop_image(ctx, node->content.image);
	}
}
145 
add_flow(fz_context * ctx,fz_pool * pool,fz_html_box * top,fz_html_box * inline_box,int type,int extras)146 static fz_html_flow *add_flow(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box, int type, int extras)
147 {
148 	size_t size = (type == FLOW_IMAGE ? sizeof(fz_html_flow) : offsetof(fz_html_flow, content) + extras);
149 	fz_html_flow *flow = fz_pool_alloc(ctx, pool, size);
150 	flow->type = type;
151 	flow->expand = 0;
152 	flow->bidi_level = 0;
153 	flow->markup_lang = 0;
154 	flow->breaks_line = 0;
155 	flow->box = inline_box;
156 	*top->flow_tail = flow;
157 	top->flow_tail = &flow->next;
158 	return flow;
159 }
160 
/* Append a FLOW_SPACE node and mark it as expandable. */
static void add_flow_space(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box)
{
	fz_html_flow *gap = add_flow(ctx, pool, top, inline_box, FLOW_SPACE, 0);

	gap->expand = 1;
}
166 
/* Append a FLOW_BREAK node (used for forced line breaks) to top's flow list. */
static void add_flow_break(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box)
{
	add_flow(ctx, pool, top, inline_box, FLOW_BREAK, 0);
}
171 
/* Append a FLOW_SBREAK node (a soft break opportunity) to top's flow list. */
static void add_flow_sbreak(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box)
{
	add_flow(ctx, pool, top, inline_box, FLOW_SBREAK, 0);
}
176 
/* Append a FLOW_SHYPHEN node (emitted for a soft hyphen) to top's flow list. */
static void add_flow_shyphen(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box)
{
	add_flow(ctx, pool, top, inline_box, FLOW_SHYPHEN, 0);
}
181 
/*
 * Append a FLOW_WORD node holding a NUL-terminated copy of the bytes in
 * the half-open range [a, b), tagged with the given markup language.
 */
static void add_flow_word(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box, const char *a, const char *b, int lang)
{
	size_t len = (size_t)(b - a);
	/* Reserve the text plus one byte for the terminator. */
	fz_html_flow *word = add_flow(ctx, pool, top, inline_box, FLOW_WORD, (int)(len + 1));

	memcpy(word->content.text, a, len);
	word->content.text[len] = 0;
	word->markup_lang = lang;
}
189 
/* Append a FLOW_IMAGE node that holds its own (kept) reference to img. */
static void add_flow_image(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box, fz_image *img)
{
	fz_html_flow *node = add_flow(ctx, pool, top, inline_box, FLOW_IMAGE, 0);

	node->content.image = fz_keep_image(ctx, img);
}
195 
/* Append a FLOW_ANCHOR node to top's flow list. */
static void add_flow_anchor(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box)
{
	add_flow(ctx, pool, top, inline_box, FLOW_ANCHOR, 0);
}
200 
/*
 * Split a FLOW_WORD node after 'offset' characters (runes, not bytes).
 *
 * The original node keeps the leading part; a new node carrying the rest
 * of the text and a copy of the original's header fields is linked in
 * directly after it. Returns the new node, or 'flow' itself when offset
 * is zero. If offset reaches the end of the text the new node is empty.
 */
static fz_html_flow *split_flow(fz_context *ctx, fz_pool *pool, fz_html_flow *flow, size_t offset)
{
	fz_html_flow *tail;
	char *cut;
	size_t remaining;
	size_t tail_len;

	assert(flow->type == FLOW_WORD);

	if (offset == 0)
		return flow;

	/* Step over whole UTF-8 sequences to find the byte position of the cut. */
	cut = flow->content.text;
	for (remaining = offset; remaining > 0 && *cut; --remaining)
	{
		int rune;
		cut += fz_chartorune(&rune, cut);
	}

	tail_len = strlen(cut);
	tail = fz_pool_alloc(ctx, pool, offsetof(fz_html_flow, content) + tail_len + 1);

	/* Duplicate the header fields, then the trailing text with its terminator. */
	memcpy(tail, flow, offsetof(fz_html_flow, content));
	memcpy(tail->content.text, cut, tail_len + 1);

	tail->next = flow->next;
	flow->next = tail;

	/* Truncate the original word at the cut. */
	*cut = 0;
	return tail;
}
227 
/*
 * Emit the pending collapsed whitespace, if any, and clear the pending
 * flag. Nothing is emitted at the beginning of a line. Depending on the
 * inline box's white-space mode the space becomes either a breakable
 * FLOW_SPACE or a one-character word.
 */
static void flush_space(fz_context *ctx, fz_html_box *flow, fz_html_box *inline_box, int lang, struct genstate *g)
{
	static const char blank[] = " ";
	int breakable = inline_box->style->white_space & WS_ALLOW_BREAK_SPACE;

	if (!g->emit_white)
		return;

	if (!g->at_bol)
	{
		if (breakable)
			add_flow_space(ctx, g->pool, flow, inline_box);
		else
			add_flow_word(ctx, g->pool, flow, inline_box, blank, blank + 1, lang);
	}
	g->emit_white = 0;
}
245 
/*
 * Pair-wise lookup table for UAX#14 linebreaks.
 *
 * Indexed as pairbrk[class of previous character][class of this character].
 * As consumed by generate_text(): '_' allows a soft break between the
 * pair; '^' and '%' (and the codes '@' and '#', which do not occur in
 * this table) are all treated as "no break" there, because whitespace is
 * handled separately.
 * The two header rows below spell the column class abbreviations
 * vertically, in the same order as the rows.
 */
static const char *pairbrk[29] =
{
/*	-OCCQGNESIPPNAHIIHBBBZCWHHJJJR- */
/*	-PLPULSXYSROULLDNYAB2WMJ23LVTI- */
	"^^^^^^^^^^^^^^^^^^^^^^^^^^^^^", /* OP open punctuation */
	"_^^%%^^^^%%_____%%__^^^______", /* CL close punctuation */
	"_^^%%^^^^%%%%%__%%__^^^______", /* CP close parenthesis */
	"^^^%%%^^^%%%%%%%%%%%^^^%%%%%%", /* QU quotation */
	"%^^%%%^^^%%%%%%%%%%%^^^%%%%%%", /* GL non-breaking glue */
	"_^^%%%^^^_______%%__^^^______", /* NS nonstarters */
	"_^^%%%^^^______%%%__^^^______", /* EX exclamation/interrogation */
	"_^^%%%^^^__%_%__%%__^^^______", /* SY symbols allowing break after */
	"_^^%%%^^^__%%%__%%__^^^______", /* IS infix numeric separator */
	"%^^%%%^^^__%%%%_%%__^^^%%%%%_", /* PR prefix numeric */
	"%^^%%%^^^__%%%__%%__^^^______", /* PO postfix numeric */
	"%^^%%%^^^%%%%%_%%%__^^^______", /* NU numeric */
	"%^^%%%^^^__%%%_%%%__^^^______", /* AL ordinary alphabetic and symbol characters */
	"%^^%%%^^^__%%%_%%%__^^^______", /* HL hebrew letter */
	"_^^%%%^^^_%____%%%__^^^______", /* ID ideographic */
	"_^^%%%^^^______%%%__^^^______", /* IN inseparable characters */
	"_^^%_%^^^__%____%%__^^^______", /* HY hyphens */
	"_^^%_%^^^_______%%__^^^______", /* BA break after */
	"%^^%%%^^^%%%%%%%%%%%^^^%%%%%%", /* BB break before */
	"_^^%%%^^^_______%%_^^^^______", /* B2 break opportunity before and after */
	"____________________^________", /* ZW zero width space */
	"%^^%%%^^^__%%%_%%%__^^^______", /* CM combining mark */
	"%^^%%%^^^%%%%%%%%%%%^^^%%%%%%", /* WJ word joiner */
	"_^^%%%^^^_%____%%%__^^^___%%_", /* H2 hangul leading/vowel syllable */
	"_^^%%%^^^_%____%%%__^^^____%_", /* H3 hangul leading/vowel/trailing syllable */
	"_^^%%%^^^_%____%%%__^^^%%%%__", /* JL hangul leading jamo */
	"_^^%%%^^^_%____%%%__^^^___%%_", /* JV hangul vowel jamo */
	"_^^%%%^^^_%____%%%__^^^____%_", /* JT hangul trailing jamo */
	"_^^%%%^^^_______%%__^^^_____%", /* RI regional indicator */
};
281 
/*
 * Convert the text of one text node into flow nodes appended to the
 * BOX_FLOW ancestor of 'box': words, spaces, forced breaks, soft break
 * opportunities and soft hyphens.
 *
 * box: the inline box that owns the generated flow nodes; its style's
 *	white_space flags select whitespace collapsing, breakable spaces
 *	and newline-forced breaks.
 * text: NUL-terminated UTF-8 text.
 * lang: markup language recorded on each generated word.
 * g: generation state; at_bol, emit_white and last_brk_cls are read and
 *	updated so that whitespace and break state carries across calls.
 */
static void generate_text(fz_context *ctx, fz_html_box *box, const char *text, int lang, struct genstate *g)
{
	fz_html_box *flow;
	fz_pool *pool = g->pool;
	int collapse = box->style->white_space & WS_COLLAPSE;
	int bsp = box->style->white_space & WS_ALLOW_BREAK_SPACE;
	int bnl = box->style->white_space & WS_FORCE_BREAK_NEWLINE;

	static const char *space = " ";

	/* Find the flow box that collects the flow nodes for this inline box. */
	flow = box;
	while (flow->type != BOX_FLOW)
		flow = flow->up;

	while (*text)
	{
		if (bnl && (*text == '\n' || *text == '\r'))
		{
			/* A newline forces a break; CR LF counts as a single newline. */
			if (text[0] == '\r' && text[1] == '\n')
				text += 2;
			else
				text += 1;
			add_flow_break(ctx, pool, flow, box);
			g->at_bol = 1;
		}
		else if (iswhite(*text))
		{
			if (collapse)
			{
				/* Swallow the whole run and remember that one space is pending.
				 * When newlines force breaks, stop at them so the branch above
				 * sees them. */
				if (bnl)
					while (*text == ' ' || *text == '\t')
						++text;
				else
					while (iswhite(*text))
						++text;
				g->emit_white = 1;
			}
			else
			{
				/* Not collapsing: every whitespace character becomes its own space. */
				// TODO: tabs
				if (bsp)
					add_flow_space(ctx, pool, flow, box);
				else
					add_flow_word(ctx, pool, flow, box, space, space+1, lang);
				++text;
			}
			g->last_brk_cls = UCDN_LINEBREAK_CLASS_WJ; /* don't add sbreaks after a space */
		}
		else
		{
			/* A run of non-whitespace: [mark, text) is the word accumulated so
			 * far, prev is the start of the character just decoded. */
			const char *prev, *mark = text;
			int c;

			flush_space(ctx, flow, box, lang, g);

			/* At the start of a line there is no previous character to break after. */
			if (g->at_bol)
				g->last_brk_cls = UCDN_LINEBREAK_CLASS_WJ;

			while (*text && !iswhite(*text))
			{
				prev = text;
				text += fz_chartorune(&c, text);
				if (c == 0xAD) /* soft hyphen */
				{
					/* Emit the word before the hyphen; the hyphen itself is
					 * not copied into any word. */
					if (mark != prev)
						add_flow_word(ctx, pool, flow, box, mark, prev, lang);
					add_flow_shyphen(ctx, pool, flow, box);
					mark = text;
					g->last_brk_cls = UCDN_LINEBREAK_CLASS_WJ; /* don't add sbreaks after a soft hyphen */
				}
				else if (bsp) /* allow soft breaks */
				{
					int this_brk_cls = ucdn_get_resolved_linebreak_class(c);
					/* Classes from RI upward are skipped: no lookup is made and
					 * last_brk_cls keeps its previous value. */
					if (this_brk_cls < UCDN_LINEBREAK_CLASS_RI)
					{
						int brk = pairbrk[g->last_brk_cls][this_brk_cls];

						/* we handle spaces elsewhere, so ignore these classes */
						if (brk == '@') brk = '^';
						if (brk == '#') brk = '^';
						if (brk == '%') brk = '^';

						if (brk == '_')
						{
							/* Break allowed before this character: close the word
							 * so far and start the next word at this character. */
							if (mark != prev)
								add_flow_word(ctx, pool, flow, box, mark, prev, lang);
							add_flow_sbreak(ctx, pool, flow, box);
							mark = prev;
						}

						g->last_brk_cls = this_brk_cls;
					}
				}
			}
			/* Emit whatever remains of the run. */
			if (mark != text)
				add_flow_word(ctx, pool, flow, box, mark, text, lang);

			g->at_bol = 0;
		}
	}
}
383 
load_html_image(fz_context * ctx,fz_archive * zip,const char * base_uri,const char * src)384 static fz_image *load_html_image(fz_context *ctx, fz_archive *zip, const char *base_uri, const char *src)
385 {
386 	char path[2048];
387 	fz_image *img = NULL;
388 	fz_buffer *buf = NULL;
389 
390 	fz_var(img);
391 	fz_var(buf);
392 
393 	fz_try(ctx)
394 	{
395 		if (!strncmp(src, "data:image/jpeg;base64,", 23))
396 			buf = fz_new_buffer_from_base64(ctx, src+23, 0);
397 		else if (!strncmp(src, "data:image/png;base64,", 22))
398 			buf = fz_new_buffer_from_base64(ctx, src+22, 0);
399 		else if (!strncmp(src, "data:image/gif;base64,", 22))
400 			buf = fz_new_buffer_from_base64(ctx, src+22, 0);
401 		else
402 		{
403 			fz_strlcpy(path, base_uri, sizeof path);
404 			fz_strlcat(path, "/", sizeof path);
405 			fz_strlcat(path, src, sizeof path);
406 			fz_urldecode(path);
407 			fz_cleanname(path);
408 			buf = fz_read_archive_entry(ctx, zip, path);
409 		}
410 #if FZ_ENABLE_SVG
411 		if (strstr(src, ".svg"))
412 			img = fz_new_image_from_svg(ctx, buf, base_uri, zip);
413 		else
414 #endif
415 			img = fz_new_image_from_buffer(ctx, buf);
416 	}
417 	fz_always(ctx)
418 		fz_drop_buffer(ctx, buf);
419 	fz_catch(ctx)
420 		fz_warn(ctx, "html: cannot load image src='%s'", src);
421 
422 	return img;
423 }
424 
load_svg_image(fz_context * ctx,fz_archive * zip,const char * base_uri,fz_xml * xml)425 static fz_image *load_svg_image(fz_context *ctx, fz_archive *zip, const char *base_uri, fz_xml *xml)
426 {
427 	fz_image *img = NULL;
428 	fz_try(ctx)
429 		img = fz_new_image_from_svg_xml(ctx, xml, base_uri, zip);
430 	fz_catch(ctx)
431 		fz_warn(ctx, "html: cannot load embedded svg document");
432 	return img;
433 }
434 
/* Append an anchor flow node for 'box' to its enclosing flow box. */
static void generate_anchor(fz_context *ctx, fz_html_box *box, struct genstate *g)
{
	fz_html_box *flow;

	/* Climb to the BOX_FLOW ancestor that collects the flow nodes. */
	for (flow = box; flow->type != BOX_FLOW; flow = flow->up)
		;

	add_flow_anchor(ctx, g->pool, flow, box);
}
443 
/*
 * Append an image to the flow that encloses 'box'.
 *
 * Pending whitespace is flushed first. A NULL img is replaced by the
 * placeholder word "[image]". Otherwise the image node is bracketed by
 * soft breaks. The caller's reference to img is always dropped here (the
 * flow node keeps its own); errors are rethrown after the drop.
 */
static void generate_image(fz_context *ctx, fz_html_box *box, fz_image *img, struct genstate *g)
{
	fz_pool *pool = g->pool;
	fz_html_box *flow;

	for (flow = box; flow->type != BOX_FLOW; flow = flow->up)
		;

	flush_space(ctx, flow, box, 0, g);

	if (img)
	{
		fz_try(ctx)
		{
			add_flow_sbreak(ctx, pool, flow, box);
			add_flow_image(ctx, pool, flow, box, img);
			add_flow_sbreak(ctx, pool, flow, box);
		}
		fz_always(ctx)
		{
			fz_drop_image(ctx, img);
		}
		fz_catch(ctx)
			fz_rethrow(ctx);
	}
	else
	{
		static const char placeholder[] = "[image]";
		add_flow_word(ctx, pool, flow, box, placeholder, placeholder + (sizeof placeholder - 1), 0);
	}

	g->at_bol = 0;
}
476 
/*
 * Put a freshly allocated box into its initial state: an unlinked block
 * box at the origin with zero extents, an empty flow list, no style, and
 * the given markup direction. Only fields shared by short and full boxes
 * are touched.
 */
static void init_box(fz_context *ctx, fz_html_box *box, fz_bidi_direction markup_dir)
{
	box->type = BOX_BLOCK;
	box->markup_dir = markup_dir;
	box->style = NULL;

	/* Geometry. */
	box->x = 0;
	box->y = 0;
	box->w = 0;
	box->b = 0;

	/* Tree links. */
	box->up = NULL;
	box->down = NULL;
	box->next = NULL;

	/* Empty flow list: the tail pointer refers to the head slot. */
	box->flow_head = NULL;
	box->flow_tail = &box->flow_head;
}
492 
/*
 * Release the image references held in the flow lists of a box, its
 * following siblings and all of their descendants. Siblings are handled
 * iteratively and children recursively.
 */
static void fz_drop_html_box(fz_context *ctx, fz_html_box *box)
{
	fz_html_box *cur, *sibling;

	for (cur = box; cur != NULL; cur = sibling)
	{
		sibling = cur->next;
		fz_drop_html_flow(ctx, cur->flow_head);
		fz_drop_html_box(ctx, cur->down);
	}
}
503 
/*
 * Storable drop callback for fz_html: release the images referenced from
 * the box tree, then drop the pool.
 */
static void fz_drop_html_imp(fz_context *ctx, fz_storable *stor)
{
	fz_html *doc = (fz_html *)stor;

	fz_drop_html_box(ctx, doc->root);
	fz_drop_pool(ctx, doc->pool);
}
510 
/* Drop a reference to an html object; the drop is bracketed by the
 * deferred-reap start/end calls. */
void fz_drop_html(fz_context *ctx, fz_html *html)
{
	fz_storable *stor = &html->storable;

	fz_defer_reap_start(ctx);
	fz_drop_storable(ctx, stor);
	fz_defer_reap_end(ctx);
}
517 
/* Take a new reference to an html object; returns the result of
 * fz_keep_storable. */
fz_html *fz_keep_html(fz_context *ctx, fz_html *html)
{
	fz_storable *stor = &html->storable;

	return fz_keep_storable(ctx, stor);
}
522 
new_box(fz_context * ctx,fz_pool * pool,fz_bidi_direction markup_dir)523 static fz_html_box *new_box(fz_context *ctx, fz_pool *pool, fz_bidi_direction markup_dir)
524 {
525 	fz_html_box *box = fz_pool_alloc(ctx, pool, sizeof *box);
526 	init_box(ctx, box, markup_dir);
527 	return box;
528 }
529 
new_short_box(fz_context * ctx,fz_pool * pool,fz_bidi_direction markup_dir)530 static fz_html_box *new_short_box(fz_context *ctx, fz_pool *pool, fz_bidi_direction markup_dir)
531 {
532 	fz_html_box *box = fz_pool_alloc(ctx, pool, offsetof(fz_html_box, padding));
533 	init_box(ctx, box, markup_dir);
534 	return box;
535 }
536 
/*
 * Set the box's type and parent and, when a parent is given, append the
 * box as the parent's last child.
 *
 * While the tree is being built, a parent's 'next' field is borrowed to
 * mean "last of my children"; this will be fixed up in a pass at the end
 * of parsing.
 */
static void insert_box(fz_context *ctx, fz_html_box *box, int type, fz_html_box *top)
{
	box->type = type;
	box->up = top;

	if (!top)
		return;

	if (top->next)
	{
		/* On the current last child, 'next' really is the sibling link. */
		top->next->next = box;
	}
	else
	{
		/* First child. */
		top->down = box;
	}

	/* Either way the new box becomes the parent's last child. */
	top->next = box;
}
559 
/*
 * Insert 'box' as a block-level child of the nearest block container at
 * or above 'top'.
 *
 * A block box or a table cell counts as a container, matching what
 * insert_inline_box accepts when it looks for a home for flow boxes.
 * From a flow or inline box the search climbs to the nearest such
 * ancestor.
 *
 * Previously a 'top' that was itself a table cell matched no branch, so
 * the new box was silently never linked into the tree; and the climb from
 * an inline box stepped over an enclosing table cell, so the block landed
 * outside its table.
 *
 * Returns the container the box was inserted into. If no container
 * applies ('top' is a table or table row), a warning is emitted, the box
 * is left unlinked as before, and 'top' is returned unchanged.
 */
static fz_html_box *insert_block_box(fz_context *ctx, fz_html_box *box, fz_html_box *top)
{
	if (top->type == BOX_FLOW || top->type == BOX_INLINE)
	{
		while (top->type != BOX_BLOCK && top->type != BOX_TABLE_CELL)
			top = top->up;
	}

	if (top->type == BOX_BLOCK || top->type == BOX_TABLE_CELL)
		insert_box(ctx, box, BOX_BLOCK, top);
	else
		fz_warn(ctx, "block box not inside block or table-cell element");

	return top;
}
580 
/*
 * Insert 'box' the way a block box is inserted, then retag it as a
 * table. Returns the container reported by insert_block_box.
 */
static fz_html_box *insert_table_box(fz_context *ctx, fz_html_box *box, fz_html_box *top)
{
	fz_html_box *container = insert_block_box(ctx, box, top);

	box->type = BOX_TABLE;
	return container;
}
587 
/*
 * Insert 'box' as a row of the nearest table at or above 'top' and
 * return that table. With no enclosing table, warn, insert the box as an
 * ordinary block instead, and return 'top' unchanged.
 */
static fz_html_box *insert_table_row_box(fz_context *ctx, fz_html_box *box, fz_html_box *top)
{
	fz_html_box *table;

	for (table = top; table; table = table->up)
	{
		if (table->type == BOX_TABLE)
			break;
	}

	if (!table)
	{
		fz_warn(ctx, "table-row not inside table element");
		insert_block_box(ctx, box, top);
		return top;
	}

	insert_box(ctx, box, BOX_TABLE_ROW, table);
	return table;
}
602 
/*
 * Insert 'box' as a cell of the nearest table row at or above 'top' and
 * return that row. With no enclosing row, warn, insert the box as an
 * ordinary block instead, and return 'top' unchanged.
 */
static fz_html_box *insert_table_cell_box(fz_context *ctx, fz_html_box *box, fz_html_box *top)
{
	fz_html_box *row;

	for (row = top; row; row = row->up)
	{
		if (row->type == BOX_TABLE_ROW)
			break;
	}

	if (!row)
	{
		fz_warn(ctx, "table-cell not inside table-row element");
		insert_block_box(ctx, box, top);
		return top;
	}

	insert_box(ctx, box, BOX_TABLE_CELL, row);
	return row;
}
617 
/*
 * Insert 'box' as an inline box.
 *
 * Inside a flow or inline box it simply becomes a child. Otherwise the
 * nearest block or table-cell ancestor is located: if that container's
 * last child is already a flow box, the inline box joins it; if not, a
 * new default-styled flow box is created under the container first and
 * the generator is marked as being at the beginning of a line.
 */
static void insert_inline_box(fz_context *ctx, fz_html_box *box, fz_html_box *top, int markup_dir, struct genstate *g)
{
	fz_html_box *container;
	fz_html_box *flow;
	fz_css_style style;

	if (top->type == BOX_FLOW || top->type == BOX_INLINE)
	{
		insert_box(ctx, box, BOX_INLINE, top);
		return;
	}

	for (container = top;
		container->type != BOX_BLOCK && container->type != BOX_TABLE_CELL;
		container = container->up)
		;

	/* While building, 'next' on a parent means 'last of my children'. */
	if (container->next && container->next->type == BOX_FLOW)
	{
		insert_box(ctx, box, BOX_INLINE, container->next);
		return;
	}

	flow = new_short_box(ctx, g->pool, markup_dir);
	flow->is_first_flow = !container->next;
	fz_default_css_style(ctx, &style);
	flow->style = fz_css_enlist(ctx, &style, &g->styles, g->pool);
	insert_box(ctx, flow, BOX_FLOW, container);
	insert_box(ctx, box, BOX_INLINE, flow);
	g->at_bol = 1;
}
647 
648 static fz_html_box *
generate_boxes(fz_context * ctx,fz_xml * node,fz_html_box * top,fz_css_match * up_match,int list_counter,int section_depth,int markup_dir,int markup_lang,struct genstate * g)649 generate_boxes(fz_context *ctx,
650 	fz_xml *node,
651 	fz_html_box *top,
652 	fz_css_match *up_match,
653 	int list_counter,
654 	int section_depth,
655 	int markup_dir,
656 	int markup_lang,
657 	struct genstate *g)
658 {
659 	fz_html_box *box, *last_top;
660 	const char *tag;
661 	int display;
662 	fz_css_style style;
663 
664 	while (node)
665 	{
666 
667 		tag = fz_xml_tag(node);
668 		if (tag)
669 		{
670 			fz_css_match match;
671 
672 			fz_match_css(ctx, &match, up_match, g->css, node);
673 
674 			display = fz_get_css_match_display(&match);
675 
676 			fz_apply_css_style(ctx, g->set, &style, &match);
677 
678 			if (tag[0]=='b' && tag[1]=='r' && tag[2]==0)
679 			{
680 				fz_html_box *flow;
681 				if (top->type != BOX_INLINE)
682 				{
683 					/* Create anonymous inline box, with the same style as the top block box. */
684 					fz_css_style style;
685 					box = new_short_box(ctx, g->pool, markup_dir);
686 					fz_default_css_style(ctx, &style);
687 					box->style = fz_css_enlist(ctx, &style, &g->styles, g->pool);
688 					insert_inline_box(ctx, box, top, markup_dir, g);
689 					style = *top->style;
690 					/* Make sure not to recursively multiply font sizes. */
691 					style.font_size.value = 1;
692 					style.font_size.unit = N_SCALE;
693 					box->style = fz_css_enlist(ctx, &style, &g->styles, g->pool);
694 					flow = box;
695 					while (flow->type != BOX_FLOW)
696 						flow = flow->up;
697 					add_flow_break(ctx, g->pool, flow, box);
698 				}
699 				else
700 				{
701 					flow = top;
702 					while (flow->type != BOX_FLOW)
703 						flow = flow->up;
704 					add_flow_break(ctx, g->pool, flow, top);
705 				}
706 				g->at_bol = 1;
707 			}
708 
709 			else if (tag[0]=='i' && tag[1]=='m' && tag[2]=='g' && tag[3]==0)
710 			{
711 				const char *src = fz_xml_att(node, "src");
712 				if (src)
713 				{
714 					int w, h;
715 					const char *w_att = fz_xml_att(node, "width");
716 					const char *h_att = fz_xml_att(node, "height");
717 					if (w_att && (w = fz_atoi(w_att)) > 0)
718 					{
719 						style.width.value = w;
720 						style.width.unit = strchr(w_att, '%') ? N_PERCENT : N_LENGTH;
721 					}
722 					if (h_att && (h = fz_atoi(h_att)) > 0)
723 					{
724 						style.height.value = h;
725 						style.height.unit = strchr(h_att, '%') ? N_PERCENT : N_LENGTH;
726 					}
727 
728 					if (display == DIS_BLOCK)
729 					{
730 						fz_html_box *imgbox;
731 						box = new_box(ctx, g->pool, markup_dir);
732 						box->style = fz_css_enlist(ctx, &style, &g->styles, g->pool);
733 						top = insert_block_box(ctx, box, top);
734 						imgbox = new_short_box(ctx, g->pool, markup_dir);
735 						imgbox->style = fz_css_enlist(ctx, &style, &g->styles, g->pool);
736 						insert_inline_box(ctx, imgbox, box, markup_dir, g);
737 						generate_image(ctx, imgbox, load_html_image(ctx, g->zip, g->base_uri, src), g);
738 					}
739 					else if (display == DIS_INLINE)
740 					{
741 						box = new_short_box(ctx, g->pool, markup_dir);
742 						box->style = fz_css_enlist(ctx, &style, &g->styles, g->pool);
743 						insert_inline_box(ctx, box, top, markup_dir, g);
744 						generate_image(ctx, box, load_html_image(ctx, g->zip, g->base_uri, src), g);
745 					}
746 				}
747 			}
748 
749 			else if (tag[0]=='s' && tag[1]=='v' && tag[2]=='g' && tag[3]==0)
750 			{
751 				box = new_short_box(ctx, g->pool, markup_dir);
752 				box->style = fz_css_enlist(ctx, &style, &g->styles, g->pool);
753 				insert_inline_box(ctx, box, top, markup_dir, g);
754 				generate_image(ctx, box, load_svg_image(ctx, g->zip, g->base_uri, node), g);
755 			}
756 
757 			else if (g->is_fb2 && tag[0]=='i' && tag[1]=='m' && tag[2]=='a' && tag[3]=='g' && tag[4]=='e' && tag[5]==0)
758 			{
759 				const char *src = fz_xml_att(node, "l:href");
760 				if (!src)
761 					src = fz_xml_att(node, "xlink:href");
762 				if (src && src[0] == '#')
763 				{
764 					fz_image *img = fz_tree_lookup(ctx, g->images, src+1);
765 					if (display == DIS_BLOCK)
766 					{
767 						fz_html_box *imgbox;
768 						box = new_box(ctx, g->pool, markup_dir);
769 						box->style = fz_css_enlist(ctx, &style, &g->styles, g->pool);
770 						top = insert_block_box(ctx, box, top);
771 						imgbox = new_short_box(ctx, g->pool, markup_dir);
772 						imgbox->style = fz_css_enlist(ctx, &style, &g->styles, g->pool);
773 						insert_inline_box(ctx, imgbox, box, markup_dir, g);
774 						generate_image(ctx, imgbox, fz_keep_image(ctx, img), g);
775 					}
776 					else if (display == DIS_INLINE)
777 					{
778 						box = new_short_box(ctx, g->pool, markup_dir);
779 						box->style = fz_css_enlist(ctx, &style, &g->styles, g->pool);
780 						insert_inline_box(ctx, box, top, markup_dir, g);
781 						generate_image(ctx, box, fz_keep_image(ctx, img), g);
782 					}
783 				}
784 			}
785 
786 			else if (display != DIS_NONE)
787 			{
788 				const char *dir, *lang, *id, *href;
789 				int child_dir = markup_dir;
790 				int child_lang = markup_lang;
791 
792 				dir = fz_xml_att(node, "dir");
793 				if (dir)
794 				{
795 					if (!strcmp(dir, "auto"))
796 						child_dir = FZ_BIDI_NEUTRAL;
797 					else if (!strcmp(dir, "rtl"))
798 						child_dir = FZ_BIDI_RTL;
799 					else if (!strcmp(dir, "ltr"))
800 						child_dir = FZ_BIDI_LTR;
801 					else
802 						child_dir = DEFAULT_DIR;
803 				}
804 
805 				lang = fz_xml_att(node, "lang");
806 				if (lang)
807 					child_lang = fz_text_language_from_string(lang);
808 
809 				if (display == DIS_INLINE)
810 					box = new_short_box(ctx, g->pool, child_dir);
811 				else
812 					box = new_box(ctx, g->pool, child_dir);
813 				box->style = fz_css_enlist(ctx, &style, &g->styles, g->pool);
814 
815 				id = fz_xml_att(node, "id");
816 				if (id)
817 					box->id = fz_pool_strdup(ctx, g->pool, id);
818 
819 				if (display == DIS_BLOCK || display == DIS_INLINE_BLOCK)
820 				{
821 					top = insert_block_box(ctx, box, top);
822 					if (g->is_fb2)
823 					{
824 						if (!strcmp(tag, "title") || !strcmp(tag, "subtitle"))
825 							box->heading = fz_mini(section_depth, 6);
826 					}
827 					else
828 					{
829 						if (tag[0]=='h' && tag[1]>='1' && tag[1]<='6' && tag[2]==0)
830 							box->heading = tag[1] - '0';
831 					}
832 				}
833 				else if (display == DIS_LIST_ITEM)
834 				{
835 					top = insert_block_box(ctx, box, top);
836 					box->list_item = ++list_counter;
837 				}
838 				else if (display == DIS_INLINE)
839 				{
840 					insert_inline_box(ctx, box, top, child_dir, g);
841 					if (id)
842 						generate_anchor(ctx, box, g);
843 					if (tag[0]=='a' && tag[1]==0)
844 					{
845 						if (g->is_fb2)
846 						{
847 							href = fz_xml_att(node, "l:href");
848 							if (!href)
849 								href = fz_xml_att(node, "xlink:href");
850 						}
851 						else
852 							href = fz_xml_att(node, g->is_fb2 ? "l:href" : "href");
853 						if (href)
854 							box->href = fz_pool_strdup(ctx, g->pool, href);
855 					}
856 				}
857 				else if (display == DIS_TABLE)
858 				{
859 					top = insert_table_box(ctx, box, top);
860 				}
861 				else if (display == DIS_TABLE_ROW)
862 				{
863 					top = insert_table_row_box(ctx, box, top);
864 				}
865 				else if (display == DIS_TABLE_CELL)
866 				{
867 					top = insert_table_cell_box(ctx, box, top);
868 				}
869 				else
870 				{
871 					fz_warn(ctx, "unknown box display type");
872 					insert_box(ctx, box, BOX_BLOCK, top);
873 				}
874 
875 				if (fz_xml_down(node))
876 				{
877 					int child_counter = list_counter;
878 					int child_section = section_depth;
879 					if (!strcmp(tag, "ul") || !strcmp(tag, "ol"))
880 						child_counter = 0;
881 					else if (!strcmp(tag, "section"))
882 						++child_section;
883 					last_top = generate_boxes(ctx,
884 						fz_xml_down(node),
885 						box,
886 						&match,
887 						child_counter,
888 						child_section,
889 						child_dir,
890 						child_lang,
891 						g);
892 					if (last_top != box)
893 						top = last_top;
894 				}
895 			}
896 		}
897 		else
898 		{
899 			const char *text = fz_xml_text(node);
900 			int collapse = top->style->white_space & WS_COLLAPSE;
901 			if (collapse && is_all_white(text))
902 			{
903 				g->emit_white = 1;
904 			}
905 			else
906 			{
907 				if (top->type != BOX_INLINE)
908 				{
909 					/* Create anonymous inline box, with the same style as the top block box. */
910 					fz_css_style style;
911 					box = new_short_box(ctx, g->pool, markup_dir);
912 					fz_default_css_style(ctx, &style);
913 					box->style = fz_css_enlist(ctx, &style, &g->styles, g->pool);
914 					insert_inline_box(ctx, box, top, markup_dir, g);
915 					style = *top->style;
916 					/* Make sure not to recursively multiply font sizes. */
917 					style.font_size.value = 1;
918 					style.font_size.unit = N_SCALE;
919 					box->style = fz_css_enlist(ctx, &style, &g->styles, g->pool);
920 					generate_text(ctx, box, text, markup_lang, g);
921 				}
922 				else
923 				{
924 					generate_text(ctx, top, text, markup_lang, g);
925 				}
926 			}
927 		}
928 
929 		node = fz_xml_next(node);
930 	}
931 
932 	return top;
933 }
934 
/*
 * Join the text of every direct child of root into one newly allocated,
 * NUL-terminated string. Children for which fz_xml_text() returns NULL
 * contribute nothing.
 *
 * The result is allocated with fz_malloc(); the caller releases it with
 * fz_free().
 */
static char *concat_text(fz_context *ctx, fz_xml *root)
{
	fz_xml *child;
	size_t total = 0;
	size_t used = 0;
	char *out;

	/* First pass: measure how many bytes of text there are. */
	for (child = fz_xml_down(root); child; child = fz_xml_next(child))
	{
		const char *text = fz_xml_text(child);
		if (text)
			total += strlen(text);
	}

	/* One extra byte for the terminator. */
	out = Memento_label(fz_malloc(ctx, total + 1), "concat_html");

	/* Second pass: copy each piece to the end of the output. */
	for (child = fz_xml_down(root); child; child = fz_xml_next(child))
	{
		const char *text = fz_xml_text(child);
		size_t len;

		if (!text)
			continue;
		len = strlen(text);
		memcpy(out + used, text, len);
		used += len;
	}
	out[used] = 0;

	return out;
}
959 
960 static void
html_load_css_link(fz_context * ctx,fz_html_font_set * set,fz_archive * zip,const char * base_uri,fz_css * css,fz_xml * root,const char * href)961 html_load_css_link(fz_context *ctx, fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_css *css, fz_xml *root, const char *href)
962 {
963 	char path[2048];
964 	char css_base_uri[2048];
965 	fz_buffer *buf;
966 
967 	fz_var(buf);
968 
969 	fz_strlcpy(path, base_uri, sizeof path);
970 	fz_strlcat(path, "/", sizeof path);
971 	fz_strlcat(path, href, sizeof path);
972 	fz_urldecode(path);
973 	fz_cleanname(path);
974 
975 	fz_dirname(css_base_uri, path, sizeof css_base_uri);
976 
977 	buf = NULL;
978 	fz_try(ctx)
979 	{
980 		buf = fz_read_archive_entry(ctx, zip, path);
981 		fz_parse_css(ctx, css, fz_string_from_buffer(ctx, buf), path);
982 		fz_add_css_font_faces(ctx, set, zip, css_base_uri, css);
983 	}
984 	fz_always(ctx)
985 		fz_drop_buffer(ctx, buf);
986 	fz_catch(ctx)
987 		fz_warn(ctx, "ignoring stylesheet %s", path);
988 }
989 
990 static void
html_load_css(fz_context * ctx,fz_html_font_set * set,fz_archive * zip,const char * base_uri,fz_css * css,fz_xml * root)991 html_load_css(fz_context *ctx, fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_css *css, fz_xml *root)
992 {
993 	fz_xml *html, *head, *node;
994 
995 	html = fz_xml_find(root, "html");
996 	head = fz_xml_find_down(html, "head");
997 	for (node = fz_xml_down(head); node; node = fz_xml_next(node))
998 	{
999 		if (fz_xml_is_tag(node, "link"))
1000 		{
1001 			char *rel = fz_xml_att(node, "rel");
1002 			if (rel && !fz_strcasecmp(rel, "stylesheet"))
1003 			{
1004 				char *type = fz_xml_att(node, "type");
1005 				if ((type && !strcmp(type, "text/css")) || !type)
1006 				{
1007 					char *href = fz_xml_att(node, "href");
1008 					if (href)
1009 					{
1010 						html_load_css_link(ctx, set, zip, base_uri, css, root, href);
1011 					}
1012 				}
1013 			}
1014 		}
1015 		else if (fz_xml_is_tag(node, "style"))
1016 		{
1017 			char *s = concat_text(ctx, node);
1018 			fz_try(ctx)
1019 			{
1020 				fz_parse_css(ctx, css, s, "<style>");
1021 				fz_add_css_font_faces(ctx, set, zip, base_uri, css);
1022 			}
1023 			fz_catch(ctx)
1024 				fz_warn(ctx, "ignoring inline stylesheet");
1025 			fz_free(ctx, s);
1026 		}
1027 	}
1028 }
1029 
1030 static void
fb2_load_css(fz_context * ctx,fz_html_font_set * set,fz_archive * zip,const char * base_uri,fz_css * css,fz_xml * root)1031 fb2_load_css(fz_context *ctx, fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_css *css, fz_xml *root)
1032 {
1033 	fz_xml *fictionbook, *stylesheet;
1034 
1035 	fictionbook = fz_xml_find(root, "FictionBook");
1036 	stylesheet = fz_xml_find_down(fictionbook, "stylesheet");
1037 	if (stylesheet)
1038 	{
1039 		char *s = concat_text(ctx, stylesheet);
1040 		fz_try(ctx)
1041 		{
1042 			fz_parse_css(ctx, css, s, "<stylesheet>");
1043 			fz_add_css_font_faces(ctx, set, zip, base_uri, css);
1044 		}
1045 		fz_catch(ctx)
1046 			fz_warn(ctx, "ignoring inline stylesheet");
1047 		fz_free(ctx, s);
1048 	}
1049 }
1050 
1051 static fz_tree *
load_fb2_images(fz_context * ctx,fz_xml * root)1052 load_fb2_images(fz_context *ctx, fz_xml *root)
1053 {
1054 	fz_xml *fictionbook, *binary;
1055 	fz_tree *images = NULL;
1056 
1057 	fictionbook = fz_xml_find(root, "FictionBook");
1058 	for (binary = fz_xml_find_down(fictionbook, "binary"); binary; binary = fz_xml_find_next(binary, "binary"))
1059 	{
1060 		const char *id = fz_xml_att(binary, "id");
1061 		char *b64 = NULL;
1062 		fz_buffer *buf = NULL;
1063 		fz_image *img = NULL;
1064 
1065 		fz_var(b64);
1066 		fz_var(buf);
1067 
1068 		fz_try(ctx)
1069 		{
1070 			b64 = concat_text(ctx, binary);
1071 			buf = fz_new_buffer_from_base64(ctx, b64, strlen(b64));
1072 			img = fz_new_image_from_buffer(ctx, buf);
1073 		}
1074 		fz_always(ctx)
1075 		{
1076 			fz_drop_buffer(ctx, buf);
1077 			fz_free(ctx, b64);
1078 		}
1079 		fz_catch(ctx)
1080 			fz_rethrow(ctx);
1081 
1082 		images = fz_tree_insert(ctx, images, id, img);
1083 	}
1084 
1085 	return images;
1086 }
1087 
/* Growable array of 32-bit code points, reused as scratch space while
 * gathering flow text for bidi analysis. */
typedef struct
{
	/* Element storage; NULL until the buffer is first grown. */
	uint32_t *data;
	/* Number of elements allocated in data. */
	size_t cap;
	/* Number of elements of data currently in use. */
	size_t len;
} uni_buf;
1094 
/* State handed to fragment_cb through its void *arg parameter. */
typedef struct
{
	/* Context and pool passed on to split_flow(). */
	fz_context *ctx;
	fz_pool *pool;
	/* Next flow node to be labelled; advanced by fragment_cb. */
	fz_html_flow *flow;
	/* Buffer holding the code points that are being fragmented. */
	uni_buf *buffer;
} bidi_data;
1102 
fragment_cb(const uint32_t * fragment,size_t fragment_len,int bidi_level,int script,void * arg)1103 static void fragment_cb(const uint32_t *fragment,
1104 			size_t fragment_len,
1105 			int bidi_level,
1106 			int script,
1107 			void *arg)
1108 {
1109 	bidi_data *data = (bidi_data *)arg;
1110 	size_t fragment_offset = fragment - data->buffer->data;
1111 
1112 	/* We are guaranteed that fragmentOffset will be at the beginning
1113 	 * of flow. */
1114 	while (fragment_len > 0)
1115 	{
1116 		size_t len;
1117 
1118 		if (data->flow->type == FLOW_SPACE)
1119 		{
1120 			len = 1;
1121 		}
1122 		else if (data->flow->type == FLOW_BREAK || data->flow->type == FLOW_SBREAK ||
1123 				data->flow->type == FLOW_SHYPHEN || data->flow->type == FLOW_ANCHOR)
1124 		{
1125 			len = 0;
1126 		}
1127 		else
1128 		{
1129 			/* Must be text */
1130 			len = fz_utflen(data->flow->content.text);
1131 			if (len > fragment_len)
1132 			{
1133 				/* We need to split this flow box */
1134 				(void)split_flow(data->ctx, data->pool, data->flow, fragment_len);
1135 				len = fz_utflen(data->flow->content.text);
1136 			}
1137 		}
1138 
1139 		/* This flow box is entirely contained within this fragment. */
1140 		data->flow->bidi_level = bidi_level;
1141 		data->flow->script = script;
1142 		data->flow = data->flow->next;
1143 		fragment_offset += len;
1144 		fragment_len -= len;
1145 	}
1146 }
1147 
1148 static fz_bidi_direction
detect_flow_directionality(fz_context * ctx,fz_pool * pool,uni_buf * buffer,fz_bidi_direction bidi_dir,fz_html_flow * flow)1149 detect_flow_directionality(fz_context *ctx, fz_pool *pool, uni_buf *buffer, fz_bidi_direction bidi_dir, fz_html_flow *flow)
1150 {
1151 	fz_html_flow *end = flow;
1152 	bidi_data data;
1153 
1154 	while (end)
1155 	{
1156 		int level = end->bidi_level;
1157 
1158 		/* Gather the text from the flow up into a single buffer (at
1159 		 * least, as much of it as has the same direction markup). */
1160 		buffer->len = 0;
1161 		while (end && (level & 1) == (end->bidi_level & 1))
1162 		{
1163 			size_t len = 0;
1164 			const char *text = "";
1165 			int broken = 0;
1166 
1167 			switch (end->type)
1168 			{
1169 			case FLOW_WORD:
1170 				len = fz_utflen(end->content.text);
1171 				text = end->content.text;
1172 				break;
1173 			case FLOW_SPACE:
1174 				len = 1;
1175 				text = " ";
1176 				break;
1177 			case FLOW_SHYPHEN:
1178 			case FLOW_SBREAK:
1179 				break;
1180 			case FLOW_BREAK:
1181 			case FLOW_IMAGE:
1182 				broken = 1;
1183 				break;
1184 			}
1185 
1186 			end = end->next;
1187 
1188 			if (broken)
1189 				break;
1190 
1191 			/* Make sure the buffer is large enough */
1192 			if (buffer->len + len > buffer->cap)
1193 			{
1194 				size_t newcap = buffer->cap;
1195 				if (newcap < 128)
1196 					newcap = 128; /* Sensible small default */
1197 
1198 				while (newcap < buffer->len + len)
1199 					newcap = (newcap * 3) / 2;
1200 
1201 				buffer->data = fz_realloc_array(ctx, buffer->data, newcap, uint32_t);
1202 				buffer->cap = newcap;
1203 			}
1204 
1205 			/* Expand the utf8 text into Unicode and store it in the buffer */
1206 			while (*text)
1207 			{
1208 				int rune;
1209 				text += fz_chartorune(&rune, text);
1210 				buffer->data[buffer->len++] = rune;
1211 			}
1212 		}
1213 
1214 		/* Detect directionality for the buffer */
1215 		data.ctx = ctx;
1216 		data.pool = pool;
1217 		data.flow = flow;
1218 		data.buffer = buffer;
1219 		fz_bidi_fragment_text(ctx, buffer->data, buffer->len, &bidi_dir, fragment_cb, &data, 0 /* Flags */);
1220 		flow = end;
1221 	}
1222 	return bidi_dir;
1223 }
1224 
1225 static void
detect_box_directionality(fz_context * ctx,fz_pool * pool,uni_buf * buffer,fz_html_box * box)1226 detect_box_directionality(fz_context *ctx, fz_pool *pool, uni_buf *buffer, fz_html_box *box)
1227 {
1228 	while (box)
1229 	{
1230 		if (box->flow_head)
1231 			box->markup_dir = detect_flow_directionality(ctx, pool, buffer, box->markup_dir, box->flow_head);
1232 		detect_box_directionality(ctx, pool, buffer, box->down);
1233 		box = box->next;
1234 	}
1235 }
1236 
1237 static void
detect_directionality(fz_context * ctx,fz_pool * pool,fz_html_box * box)1238 detect_directionality(fz_context *ctx, fz_pool *pool, fz_html_box *box)
1239 {
1240 	uni_buf buffer = { NULL };
1241 
1242 	fz_try(ctx)
1243 		detect_box_directionality(ctx, pool, &buffer, box);
1244 	fz_always(ctx)
1245 		fz_free(ctx, buffer.data);
1246 	fz_catch(ctx)
1247 		fz_rethrow(ctx);
1248 }
1249 
1250 /* Here we look for places where box->next actually means
1251  * 'the last of my children', and correct it by setting
1252  * next == NULL. We can spot these because box->next->up == box. */
1253 static void
fix_nexts(fz_html_box * box)1254 fix_nexts(fz_html_box *box)
1255 {
1256 	while (box)
1257 	{
1258 		if (box->down)
1259 			fix_nexts(box->down);
1260 		if (box->next && box->next->up == box)
1261 		{
1262 			box->next = NULL;
1263 			break;
1264 		}
1265 		box = box->next;
1266 	}
1267 }
1268 
1269 static fz_html *
fz_parse_html_imp(fz_context * ctx,fz_html_font_set * set,fz_archive * zip,const char * base_uri,fz_buffer * buf,const char * user_css,int try_xml,int try_html5)1270 fz_parse_html_imp(fz_context *ctx,
1271 	fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_buffer *buf, const char *user_css,
1272 	int try_xml, int try_html5)
1273 {
1274 	fz_xml_doc *xml;
1275 	fz_xml *root, *node;
1276 	fz_html *html = NULL;
1277 	char *title;
1278 
1279 	fz_css_match match;
1280 	struct genstate g;
1281 
1282 	g.pool = NULL;
1283 	g.set = set;
1284 	g.zip = zip;
1285 	g.images = NULL;
1286 	g.base_uri = base_uri;
1287 	g.css = NULL;
1288 	g.at_bol = 0;
1289 	g.emit_white = 0;
1290 	g.last_brk_cls = UCDN_LINEBREAK_CLASS_OP;
1291 	g.styles = NULL;
1292 
1293 	if (try_xml && try_html5)
1294 	{
1295 		fz_try(ctx)
1296 			xml = fz_parse_xml(ctx, buf, 1);
1297 		fz_catch(ctx)
1298 		{
1299 			if (fz_caught(ctx) == FZ_ERROR_SYNTAX)
1300 			{
1301 				fz_warn(ctx, "syntax error in XHTML; retrying using HTML5 parser");
1302 				xml = fz_parse_xml_from_html5(ctx, buf);
1303 			}
1304 			else
1305 				fz_rethrow(ctx);
1306 		}
1307 	}
1308 	else if (try_xml)
1309 		xml = fz_parse_xml(ctx, buf, 1);
1310 	else if (try_html5)
1311 		xml = fz_parse_xml_from_html5(ctx, buf);
1312 	else
1313 		return NULL; /* should never happen! */
1314 
1315 	root = fz_xml_root(xml);
1316 
1317 	fz_try(ctx)
1318 		g.css = fz_new_css(ctx);
1319 	fz_catch(ctx)
1320 	{
1321 		fz_drop_xml(ctx, xml);
1322 		fz_rethrow(ctx);
1323 	}
1324 
1325 #ifndef NDEBUG
1326 	if (fz_atoi(getenv("FZ_DEBUG_XML")))
1327 		fz_debug_xml(root, 0);
1328 #endif
1329 
1330 	fz_try(ctx)
1331 	{
1332 		if (fz_xml_find(root, "FictionBook"))
1333 		{
1334 			g.is_fb2 = 1;
1335 			fz_parse_css(ctx, g.css, fb2_default_css, "<default:fb2>");
1336 			if (fz_use_document_css(ctx))
1337 				fb2_load_css(ctx, g.set, g.zip, g.base_uri, g.css, root);
1338 			g.images = load_fb2_images(ctx, root);
1339 		}
1340 		else
1341 		{
1342 			g.is_fb2 = 0;
1343 			fz_parse_css(ctx, g.css, html_default_css, "<default:html>");
1344 			if (fz_use_document_css(ctx))
1345 				html_load_css(ctx, g.set, g.zip, g.base_uri, g.css, root);
1346 			g.images = NULL;
1347 		}
1348 
1349 		if (user_css)
1350 		{
1351 			fz_parse_css(ctx, g.css, user_css, "<user>");
1352 			fz_add_css_font_faces(ctx, g.set, g.zip, ".", g.css);
1353 		}
1354 	}
1355 	fz_catch(ctx)
1356 	{
1357 		fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
1358 		fz_warn(ctx, "ignoring styles due to errors: %s", fz_caught_message(ctx));
1359 	}
1360 
1361 #ifndef NDEBUG
1362 	if (fz_atoi(getenv("FZ_DEBUG_CSS")))
1363 		fz_debug_css(ctx, g.css);
1364 #endif
1365 
1366 	fz_try(ctx)
1367 	{
1368 		fz_css_style style;
1369 
1370 		g.pool = fz_new_pool(ctx);
1371 		html = fz_pool_alloc(ctx, g.pool, sizeof *html);
1372 		FZ_INIT_STORABLE(html, 1, fz_drop_html_imp);
1373 		html->pool = g.pool;
1374 		html->root = new_box(ctx, g.pool, DEFAULT_DIR);
1375 		html->layout_w = 0;
1376 		html->layout_h = 0;
1377 		html->layout_em = 0;
1378 
1379 		fz_match_css_at_page(ctx, &match, g.css);
1380 		fz_apply_css_style(ctx, g.set, &style, &match);
1381 		html->root->style = fz_css_enlist(ctx, &style, &g.styles, g.pool);
1382 		// TODO: transfer page margins out of this hacky box
1383 
1384 		generate_boxes(ctx, root, html->root, &match, 0, 0, DEFAULT_DIR, FZ_LANG_UNSET, &g);
1385 		fix_nexts(html->root);
1386 
1387 		detect_directionality(ctx, g.pool, html->root);
1388 
1389 		if (g.is_fb2)
1390 		{
1391 			node = fz_xml_find(root, "FictionBook");
1392 			node = fz_xml_find_down(node, "description");
1393 			node = fz_xml_find_down(node, "title-info");
1394 			node = fz_xml_find_down(node, "book-title");
1395 			title = fz_xml_text(fz_xml_down(node));
1396 			if (title)
1397 				html->title = fz_pool_strdup(ctx, g.pool, title);
1398 		}
1399 		else
1400 		{
1401 			node = fz_xml_find(root, "html");
1402 			node = fz_xml_find_down(node, "head");
1403 			node = fz_xml_find_down(node, "title");
1404 			title = fz_xml_text(fz_xml_down(node));
1405 			if (title)
1406 				html->title = fz_pool_strdup(ctx, g.pool, title);
1407 		}
1408 	}
1409 	fz_always(ctx)
1410 	{
1411 		fz_drop_tree(ctx, g.images, (void(*)(fz_context*,void*))fz_drop_image);
1412 		fz_drop_css(ctx, g.css);
1413 		fz_drop_xml(ctx, xml);
1414 	}
1415 	fz_catch(ctx)
1416 	{
1417 		fz_drop_html(ctx, html);
1418 		fz_rethrow(ctx);
1419 	}
1420 
1421 	return html;
1422 }
1423 
1424 fz_html *
fz_parse_fb2(fz_context * ctx,fz_html_font_set * set,fz_archive * zip,const char * base_uri,fz_buffer * buf,const char * user_css)1425 fz_parse_fb2(fz_context *ctx, fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_buffer *buf, const char *user_css)
1426 {
1427 	/* parse only as XML */
1428 	return fz_parse_html_imp(ctx, set, zip, base_uri, buf, user_css, 1, 0);
1429 }
1430 
1431 fz_html *
fz_parse_html5(fz_context * ctx,fz_html_font_set * set,fz_archive * zip,const char * base_uri,fz_buffer * buf,const char * user_css)1432 fz_parse_html5(fz_context *ctx, fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_buffer *buf, const char *user_css)
1433 {
1434 	/* parse only as HTML5 */
1435 	return fz_parse_html_imp(ctx, set, zip, base_uri, buf, user_css, 0, 1);
1436 }
1437 
1438 fz_html *
fz_parse_xhtml(fz_context * ctx,fz_html_font_set * set,fz_archive * zip,const char * base_uri,fz_buffer * buf,const char * user_css)1439 fz_parse_xhtml(fz_context *ctx, fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_buffer *buf, const char *user_css)
1440 {
1441 	/* try as XML first, fall back to HTML5 */
1442 	return fz_parse_html_imp(ctx, set, zip, base_uri, buf, user_css, 1, 1);
1443 }
1444 
/* Write level tab characters to stdout; nothing if level is zero or
 * negative. */
static void indent(int level)
{
	int i;

	for (i = 0; i < level; ++i)
		putchar('\t');
}
1450 
1451 static void
fz_debug_html_flow(fz_context * ctx,fz_html_flow * flow,int level)1452 fz_debug_html_flow(fz_context *ctx, fz_html_flow *flow, int level)
1453 {
1454 	fz_html_box *sbox = NULL;
1455 	while (flow)
1456 	{
1457 		if (flow->box != sbox) {
1458 			if (sbox) {
1459 				indent(level);
1460 				printf("}\n");
1461 			}
1462 			sbox = flow->box;
1463 			indent(level);
1464 			printf("span em=%g font='%s'", sbox->em, fz_font_name(ctx, sbox->style->font));
1465 			if (fz_font_is_serif(ctx, sbox->style->font))
1466 				printf(" serif");
1467 			else
1468 				printf(" sans");
1469 			if (fz_font_is_monospaced(ctx, sbox->style->font))
1470 				printf(" monospaced");
1471 			if (fz_font_is_bold(ctx, sbox->style->font))
1472 				printf(" bold");
1473 			if (fz_font_is_italic(ctx, sbox->style->font))
1474 				printf(" italic");
1475 			if (sbox->style->small_caps)
1476 				printf(" small-caps");
1477 			printf("\n");
1478 			indent(level);
1479 			printf("{\n");
1480 		}
1481 
1482 		indent(level+1);
1483 		switch (flow->type) {
1484 		case FLOW_WORD: printf("word "); break;
1485 		case FLOW_SPACE: printf("space"); break;
1486 		case FLOW_SBREAK: printf("sbrk "); break;
1487 		case FLOW_SHYPHEN: printf("shy  "); break;
1488 		case FLOW_BREAK: printf("break"); break;
1489 		case FLOW_IMAGE: printf("image"); break;
1490 		case FLOW_ANCHOR: printf("anchor"); break;
1491 		}
1492 		printf(" y=%g x=%g w=%g", flow->y, flow->x, flow->w);
1493 		if (flow->type == FLOW_IMAGE)
1494 			printf(" h=%g", flow->h);
1495 		if (flow->type == FLOW_WORD)
1496 			printf(" text='%s'", flow->content.text);
1497 		printf("\n");
1498 		if (flow->breaks_line) {
1499 			indent(level+1);
1500 			printf("*\n");
1501 		}
1502 
1503 		flow = flow->next;
1504 	}
1505 	indent(level);
1506 	printf("}\n");
1507 }
1508 
1509 static void
fz_debug_html_box(fz_context * ctx,fz_html_box * box,int level)1510 fz_debug_html_box(fz_context *ctx, fz_html_box *box, int level)
1511 {
1512 	while (box)
1513 	{
1514 		indent(level);
1515 		switch (box->type) {
1516 		case BOX_BLOCK: printf("block"); break;
1517 		case BOX_FLOW: printf("flow"); break;
1518 		case BOX_INLINE: printf("inline"); break;
1519 		case BOX_TABLE: printf("table"); break;
1520 		case BOX_TABLE_ROW: printf("table-row"); break;
1521 		case BOX_TABLE_CELL: printf("table-cell"); break;
1522 		}
1523 
1524 		printf(" em=%g x=%g y=%g w=%g b=%g\n", box->em, box->x, box->y, box->w, box->b);
1525 
1526 		indent(level);
1527 		printf("{\n");
1528 		if (box->type == BOX_BLOCK) {
1529 			indent(level+1);
1530 			printf("margin=%g %g %g %g\n", box->margin[0], box->margin[1], box->margin[2], box->margin[3]);
1531 		}
1532 		if (box->is_first_flow) {
1533 			indent(level+1);
1534 			printf("is-first-flow\n");
1535 		}
1536 		if (box->list_item) {
1537 			indent(level+1);
1538 			printf("list=%d\n", box->list_item);
1539 		}
1540 		if (box->id) {
1541 			indent(level+1);
1542 			printf("id=%s\n", box->id);
1543 		}
1544 		if (box->href) {
1545 			indent(level+1);
1546 			printf("href=%s\n", box->href);
1547 		}
1548 
1549 		if (box->down)
1550 			fz_debug_html_box(ctx, box->down, level + 1);
1551 		if (box->flow_head)
1552 			fz_debug_html_flow(ctx, box->flow_head, level + 1);
1553 
1554 		indent(level);
1555 		printf("}\n");
1556 
1557 		box = box->next;
1558 	}
1559 }
1560 
1561 void
fz_debug_html(fz_context * ctx,fz_html_box * box)1562 fz_debug_html(fz_context *ctx, fz_html_box *box)
1563 {
1564 	fz_debug_html_box(ctx, box, 0);
1565 }
1566 
1567 static size_t
fz_html_size(fz_context * ctx,fz_html * html)1568 fz_html_size(fz_context *ctx, fz_html *html)
1569 {
1570 	return html ? fz_pool_size(ctx, html->pool) : 0;
1571 }
1572 
/* Magic to make html storable.
 * This is the key under which a parsed fz_html is kept in the store:
 * the owning document pointer plus a chapter number. */
typedef struct {
	/* Reference count, managed through fz_keep_imp/fz_drop_imp. */
	int refs;
	/* Opaque owner pointer; only ever compared by identity. */
	void *doc;
	/* Chapter number within doc. */
	int chapter_num;
} fz_html_key;
1579 
1580 static int
fz_make_hash_html_key(fz_context * ctx,fz_store_hash * hash,void * key_)1581 fz_make_hash_html_key(fz_context *ctx, fz_store_hash *hash, void *key_)
1582 {
1583 	fz_html_key *key = (fz_html_key *)key_;
1584 	hash->u.pi.ptr = key->doc;
1585 	hash->u.pi.i = key->chapter_num;
1586 	return 1;
1587 }
1588 
1589 static void *
fz_keep_html_key(fz_context * ctx,void * key_)1590 fz_keep_html_key(fz_context *ctx, void *key_)
1591 {
1592 	fz_html_key *key = (fz_html_key *)key_;
1593 	return fz_keep_imp(ctx, key, &key->refs);
1594 }
1595 
1596 static void
fz_drop_html_key(fz_context * ctx,void * key_)1597 fz_drop_html_key(fz_context *ctx, void *key_)
1598 {
1599 	fz_html_key *key = (fz_html_key *)key_;
1600 	if (fz_drop_imp(ctx, key, &key->refs))
1601 	{
1602 		fz_free(ctx, key);
1603 	}
1604 }
1605 
1606 static int
fz_cmp_html_key(fz_context * ctx,void * k0_,void * k1_)1607 fz_cmp_html_key(fz_context *ctx, void *k0_, void *k1_)
1608 {
1609 	fz_html_key *k0 = (fz_html_key *)k0_;
1610 	fz_html_key *k1 = (fz_html_key *)k1_;
1611 	return k0->doc == k1->doc && k0->chapter_num == k1->chapter_num;
1612 }
1613 
1614 static void
fz_format_html_key(fz_context * ctx,char * s,size_t n,void * key_)1615 fz_format_html_key(fz_context *ctx, char *s, size_t n, void *key_)
1616 {
1617 	fz_html_key *key = (fz_html_key *)key_;
1618 	fz_snprintf(s, n, "(html doc=%p, ch=%d)", key->doc, key->chapter_num);
1619 }
1620 
/* Store type descriptor for fz_html entries. It is the type argument
 * passed to fz_store_item, fz_find_item and fz_filter_store in this file.
 * NOTE(review): the meaning of each slot is inferred from the callback
 * names; confirm against the fz_store_type declaration. */
static const fz_store_type fz_html_store_type =
{
	/* Type name. */
	"fz_html",
	/* Build a hash from a key. */
	fz_make_hash_html_key,
	/* Take / release a reference on a key. */
	fz_keep_html_key,
	fz_drop_html_key,
	/* Compare two keys for equality. */
	fz_cmp_html_key,
	/* Describe a key as text. */
	fz_format_html_key,
	/* Last slot is left NULL: no callback supplied. */
	NULL
};
1631 
/*
 * Put a parsed html into the store, keyed by (doc, chapter).
 *
 * If fz_store_item() hands back another item for the same key, the html
 * passed in is dropped and that other item is returned instead. Otherwise
 * the html passed in is returned.
 *
 * Storing is best effort: an error while creating the key or storing the
 * item is swallowed and html is returned unchanged.
 */
fz_html *fz_store_html(fz_context *ctx, fz_html *html, void *doc, int chapter)
{
	fz_html_key *key = NULL;
	fz_html *existing;

	fz_var(key);

	fz_try(ctx)
	{
		key = fz_malloc_struct(ctx, fz_html_key);
		key->refs = 1;
		key->doc = doc;
		key->chapter_num = chapter;

		/* Stick the parsed html in the store */
		existing = fz_store_item(ctx, key, html, fz_html_size(ctx, html), &fz_html_store_type);
		if (existing)
		{
			fz_drop_html(ctx, html);
			html = existing;
		}
	}
	fz_always(ctx)
	{
		/* Release our own reference on the key. */
		fz_drop_html_key(ctx, key);
	}
	fz_catch(ctx)
	{
		/* Do nothing */
	}

	return html;
}
1662 
fz_find_html(fz_context * ctx,void * doc,int chapter)1663 fz_html *fz_find_html(fz_context *ctx, void *doc, int chapter)
1664 {
1665 	fz_html_key key;
1666 
1667 	key.refs = 1;
1668 	key.doc = doc;
1669 	key.chapter_num = chapter;
1670 	return fz_find_item(ctx, &fz_drop_html_imp, &key, &fz_html_store_type);
1671 }
1672 
1673 static int
html_filter_store(fz_context * ctx,void * doc,void * key_)1674 html_filter_store(fz_context *ctx, void *doc, void *key_)
1675 {
1676 	fz_html_key *key = (fz_html_key *)key_;
1677 
1678 	return (doc == key->doc);
1679 }
1680 
/* Run fz_filter_store() over the fz_html store entries, selecting those
 * whose key refers to doc. */
void fz_purge_stored_html(fz_context *ctx, void *doc)
{
	fz_filter_store(ctx, html_filter_store, doc, &fz_html_store_type);
}
1685