1 #include "mupdf/fitz.h"
2 
3 #define SUBSCRIPT_OFFSET 0.2f
4 #define SUPERSCRIPT_OFFSET -0.2f
5 
6 #include <ft2build.h>
7 #include FT_FREETYPE_H
8 
/* HTML output (visual formatting with preserved layout) */
10 
11 static int
detect_super_script(fz_stext_line * line,fz_stext_char * ch)12 detect_super_script(fz_stext_line *line, fz_stext_char *ch)
13 {
14 	if (line->wmode == 0 && line->dir.x == 1 && line->dir.y == 0)
15 		return ch->origin.y < line->first_char->origin.y - ch->size * 0.1f;
16 	return 0;
17 }
18 
19 static const char *
font_full_name(fz_context * ctx,fz_font * font)20 font_full_name(fz_context *ctx, fz_font *font)
21 {
22 	const char *name = fz_font_name(ctx, font);
23 	const char *s = strchr(name, '+');
24 	return s ? s + 1 : name;
25 }
26 
27 static void
font_family_name(fz_context * ctx,fz_font * font,char * buf,int size,int is_mono,int is_serif)28 font_family_name(fz_context *ctx, fz_font *font, char *buf, int size, int is_mono, int is_serif)
29 {
30 	const char *name = font_full_name(ctx, font);
31 	char *s;
32 	fz_strlcpy(buf, name, size);
33 	s = strrchr(buf, '-');
34 	if (s)
35 		*s = 0;
36 	if (is_mono)
37 		fz_strlcat(buf, ",monospace", size);
38 	else
39 		fz_strlcat(buf, is_serif ? ",serif" : ",sans-serif", size);
40 }
41 
42 static void
fz_print_style_begin_html(fz_context * ctx,fz_output * out,fz_font * font,float size,int sup,int color)43 fz_print_style_begin_html(fz_context *ctx, fz_output *out, fz_font *font, float size, int sup, int color)
44 {
45 	char family[80];
46 
47 	int is_bold = fz_font_is_bold(ctx, font);
48 	int is_italic = fz_font_is_italic(ctx, font);
49 	int is_serif = fz_font_is_serif(ctx, font);
50 	int is_mono = fz_font_is_monospaced(ctx, font);
51 
52 	font_family_name(ctx, font, family, sizeof family, is_mono, is_serif);
53 
54 	if (sup) fz_write_string(ctx, out, "<sup>");
55 	if (is_mono) fz_write_string(ctx, out, "<tt>");
56 	if (is_bold) fz_write_string(ctx, out, "<b>");
57 	if (is_italic) fz_write_string(ctx, out, "<i>");
58 	fz_write_printf(ctx, out, "<span style=\"font-family:%s;font-size:%gpt", family, size);
59 	if (color != 0)
60 		fz_write_printf(ctx, out, ";color:#%06x", color);
61 	fz_write_printf(ctx, out, "\">");
62 }
63 
64 static void
fz_print_style_end_html(fz_context * ctx,fz_output * out,fz_font * font,float size,int sup)65 fz_print_style_end_html(fz_context *ctx, fz_output *out, fz_font *font, float size, int sup)
66 {
67 	int is_mono = fz_font_is_monospaced(ctx, font);
68 	int is_bold = fz_font_is_bold(ctx,font);
69 	int is_italic = fz_font_is_italic(ctx, font);
70 
71 	fz_write_string(ctx, out, "</span>");
72 	if (is_italic) fz_write_string(ctx, out, "</i>");
73 	if (is_bold) fz_write_string(ctx, out, "</b>");
74 	if (is_mono) fz_write_string(ctx, out, "</tt>");
75 	if (sup) fz_write_string(ctx, out, "</sup>");
76 }
77 
78 static void
fz_print_stext_image_as_html(fz_context * ctx,fz_output * out,fz_stext_block * block)79 fz_print_stext_image_as_html(fz_context *ctx, fz_output *out, fz_stext_block *block)
80 {
81 	int x = block->bbox.x0;
82 	int y = block->bbox.y0;
83 	int w = block->bbox.x1 - block->bbox.x0;
84 	int h = block->bbox.y1 - block->bbox.y0;
85 
86 	fz_write_printf(ctx, out, "<img style=\"position:absolute;top:%dpt;left:%dpt;width:%dpt;height:%dpt\" src=\"", y, x, w, h);
87 	fz_write_image_as_data_uri(ctx, out, block->u.i.image);
88 	fz_write_string(ctx, out, "\">\n");
89 }
90 
91 void
fz_print_stext_block_as_html(fz_context * ctx,fz_output * out,fz_stext_block * block)92 fz_print_stext_block_as_html(fz_context *ctx, fz_output *out, fz_stext_block *block)
93 {
94 	fz_stext_line *line;
95 	fz_stext_char *ch;
96 	int x, y;
97 
98 	fz_font *font = NULL;
99 	float size = 0;
100 	int sup = 0;
101 	int color = 0;
102 
103 	for (line = block->u.t.first_line; line; line = line->next)
104 	{
105 		x = line->bbox.x0;
106 		y = line->bbox.y0;
107 
108 		fz_write_printf(ctx, out, "<p style=\"position:absolute;white-space:pre;margin:0;padding:0;top:%dpt;left:%dpt\">", y, x);
109 		font = NULL;
110 
111 		for (ch = line->first_char; ch; ch = ch->next)
112 		{
113 			int ch_sup = detect_super_script(line, ch);
114 			if (ch->font != font || ch->size != size || ch_sup != sup || ch->color != color)
115 			{
116 				if (font)
117 					fz_print_style_end_html(ctx, out, font, size, sup);
118 				font = ch->font;
119 				size = ch->size;
120 				color = ch->color;
121 				sup = ch_sup;
122 				fz_print_style_begin_html(ctx, out, font, size, sup, color);
123 			}
124 
125 			switch (ch->c)
126 			{
127 			default:
128 				if (ch->c >= 32 && ch->c <= 127)
129 					fz_write_byte(ctx, out, ch->c);
130 				else
131 					fz_write_printf(ctx, out, "&#x%x;", ch->c);
132 				break;
133 			case '<': fz_write_string(ctx, out, "&lt;"); break;
134 			case '>': fz_write_string(ctx, out, "&gt;"); break;
135 			case '&': fz_write_string(ctx, out, "&amp;"); break;
136 			case '"': fz_write_string(ctx, out, "&quot;"); break;
137 			case '\'': fz_write_string(ctx, out, "&apos;"); break;
138 			}
139 		}
140 
141 		if (font)
142 			fz_print_style_end_html(ctx, out, font, size, sup);
143 
144 		fz_write_string(ctx, out, "</p>\n");
145 	}
146 }
147 
148 void
fz_print_stext_page_as_html(fz_context * ctx,fz_output * out,fz_stext_page * page,int id)149 fz_print_stext_page_as_html(fz_context *ctx, fz_output *out, fz_stext_page *page, int id)
150 {
151 	fz_stext_block *block;
152 
153 	int w = page->mediabox.x1 - page->mediabox.x0;
154 	int h = page->mediabox.y1 - page->mediabox.y0;
155 
156 	fz_write_printf(ctx, out, "<div id=\"page%d\" style=\"position:relative;width:%dpt;height:%dpt;background-color:white\">\n", id, w, h);
157 
158 	for (block = page->first_block; block; block = block->next)
159 	{
160 		if (block->type == FZ_STEXT_BLOCK_IMAGE)
161 			fz_print_stext_image_as_html(ctx, out, block);
162 		else if (block->type == FZ_STEXT_BLOCK_TEXT)
163 			fz_print_stext_block_as_html(ctx, out, block);
164 	}
165 
166 	fz_write_string(ctx, out, "</div>\n");
167 }
168 
169 void
fz_print_stext_header_as_html(fz_context * ctx,fz_output * out)170 fz_print_stext_header_as_html(fz_context *ctx, fz_output *out)
171 {
172 	fz_write_string(ctx, out, "<!DOCTYPE html>\n");
173 	fz_write_string(ctx, out, "<html>\n");
174 	fz_write_string(ctx, out, "<head>\n");
175 	fz_write_string(ctx, out, "<style>\n");
176 	fz_write_string(ctx, out, "body{background-color:gray}\n");
177 	fz_write_string(ctx, out, "div{margin:1em auto}\n");
178 	fz_write_string(ctx, out, "</style>\n");
179 	fz_write_string(ctx, out, "</head>\n");
180 	fz_write_string(ctx, out, "<body>\n");
181 }
182 
183 void
fz_print_stext_trailer_as_html(fz_context * ctx,fz_output * out)184 fz_print_stext_trailer_as_html(fz_context *ctx, fz_output *out)
185 {
186 	fz_write_string(ctx, out, "</body>\n");
187 	fz_write_string(ctx, out, "</html>\n");
188 }
189 
/* XHTML output (semantic, little layout, suitable for reflow) */
191 
192 static void
fz_print_stext_image_as_xhtml(fz_context * ctx,fz_output * out,fz_stext_block * block)193 fz_print_stext_image_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_block *block)
194 {
195 	int w = block->bbox.x1 - block->bbox.x0;
196 	int h = block->bbox.y1 - block->bbox.y0;
197 
198 	fz_write_printf(ctx, out, "<p><img width=\"%d\" height=\"%d\" src=\"", w, h);
199 	fz_write_image_as_data_uri(ctx, out, block->u.i.image);
200 	fz_write_string(ctx, out, "\"/></p>\n");
201 }
202 
203 static void
fz_print_style_begin_xhtml(fz_context * ctx,fz_output * out,fz_font * font,int sup)204 fz_print_style_begin_xhtml(fz_context *ctx, fz_output *out, fz_font *font, int sup)
205 {
206 	int is_mono = fz_font_is_monospaced(ctx, font);
207 	int is_bold = fz_font_is_bold(ctx, font);
208 	int is_italic = fz_font_is_italic(ctx, font);
209 
210 	if (sup)
211 		fz_write_string(ctx, out, "<sup>");
212 	if (is_mono)
213 		fz_write_string(ctx, out, "<tt>");
214 	if (is_bold)
215 		fz_write_string(ctx, out, "<b>");
216 	if (is_italic)
217 		fz_write_string(ctx, out, "<i>");
218 }
219 
220 static void
fz_print_style_end_xhtml(fz_context * ctx,fz_output * out,fz_font * font,int sup)221 fz_print_style_end_xhtml(fz_context *ctx, fz_output *out, fz_font *font, int sup)
222 {
223 	int is_mono = fz_font_is_monospaced(ctx, font);
224 	int is_bold = fz_font_is_bold(ctx, font);
225 	int is_italic = fz_font_is_italic(ctx, font);
226 
227 	if (is_italic)
228 		fz_write_string(ctx, out, "</i>");
229 	if (is_bold)
230 		fz_write_string(ctx, out, "</b>");
231 	if (is_mono)
232 		fz_write_string(ctx, out, "</tt>");
233 	if (sup)
234 		fz_write_string(ctx, out, "</sup>");
235 }
236 
/*
	Return the arithmetic mean of the size field over the character
	list starting at ch, or 0 for an empty list.
*/
static float avg_font_size_of_line(fz_stext_char *ch)
{
	fz_stext_char *it;
	float total = 0;
	int count = 0;

	for (it = ch; it != NULL; it = it->next)
	{
		total += it->size;
		count++;
	}

	if (count == 0)
		return 0;
	return total / count;
}
251 
/*
	Map a font size to the name of the XHTML element used for it:
	"h1" from 20 upwards, "h2" from 15, "h3" from 12, "p" otherwise.

	Each tag name is always returned as the same pointer, so callers
	may compare results by address.
*/
static const char *tag_from_font_size(float size)
{
	static const struct { float min_size; const char *tag; } headings[] = {
		{ 20, "h1" },
		{ 15, "h2" },
		{ 12, "h3" },
	};
	static const char paragraph[] = "p";
	int i;

	/* Thresholds are ordered largest first; the first match wins. */
	for (i = 0; i < (int)(sizeof headings / sizeof headings[0]); i++)
	{
		if (size >= headings[i].min_size)
			return headings[i].tag;
	}
	return paragraph;
}
259 
/*
	Write a text block as reflowable XHTML.

	Consecutive lines are merged into one element whose tag (h1, h2,
	h3 or p) is chosen from the line's average font size; a new
	element is started whenever that tag changes from one line to the
	next. Inline style runs (<sup>, <tt>, <b>, <i>) follow the font
	and superscript state of the characters and are closed and
	reopened around element boundaries so the markup stays properly
	nested. A single space is inserted between lines unless the
	previous line already ended in a space.

	Characters <, >, &, " and ' are written as named entities, code
	points 32..127 as raw bytes and everything else as hexadecimal
	character references.

	A block without any lines produces no output at all.
*/
static void fz_print_stext_block_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_block *block)
{
	fz_stext_line *line;
	fz_stext_char *ch;

	fz_font *font = NULL;	/* font of the open style run, NULL if none */
	int sup = 0;		/* superscript state of the open style run */
	int sp = 1;		/* was the last character written a space? */
	const char *tag = NULL;	/* currently open element, NULL if none */
	const char *new_tag;

	for (line = block->u.t.first_line; line; line = line->next)
	{
		/* Tags are compared by address: tag_from_font_size hands out one string per tag. */
		new_tag = tag_from_font_size(avg_font_size_of_line(line->first_char));
		if (tag != new_tag)
		{
			if (tag)
			{
				/* Inline elements must be closed before their parent is. */
				if (font)
					fz_print_style_end_xhtml(ctx, out, font, sup);
				fz_write_printf(ctx, out, "</%s>", tag);
			}
			tag = new_tag;
			fz_write_printf(ctx, out, "<%s>", tag);
			/* Carry the running style over into the new element. */
			if (font)
				fz_print_style_begin_xhtml(ctx, out, font, sup);
		}

		/* Separate this line from the previous one. */
		if (!sp)
			fz_write_byte(ctx, out, ' ');

		for (ch = line->first_char; ch; ch = ch->next)
		{
			int ch_sup = detect_super_script(line, ch);
			if (ch->font != font || ch_sup != sup)
			{
				if (font)
					fz_print_style_end_xhtml(ctx, out, font, sup);
				font = ch->font;
				sup = ch_sup;
				fz_print_style_begin_xhtml(ctx, out, font, sup);
			}

			sp = (ch->c == ' ');
			switch (ch->c)
			{
			default:
				if (ch->c >= 32 && ch->c <= 127)
					fz_write_byte(ctx, out, ch->c);
				else
					fz_write_printf(ctx, out, "&#x%x;", ch->c);
				break;
			case '<': fz_write_string(ctx, out, "&lt;"); break;
			case '>': fz_write_string(ctx, out, "&gt;"); break;
			case '&': fz_write_string(ctx, out, "&amp;"); break;
			case '"': fz_write_string(ctx, out, "&quot;"); break;
			case '\'': fz_write_string(ctx, out, "&apos;"); break;
			}
		}
	}

	if (font)
		fz_print_style_end_xhtml(ctx, out, font, sup);

	/*
		Only close an element that was actually opened. With no lines
		in the block, tag is still NULL and writing a closing tag
		would both pass NULL to %s and leave an unmatched end tag in
		the document.
	*/
	if (tag)
		fz_write_printf(ctx, out, "</%s>\n", tag);
}
325 
326 void
fz_print_stext_page_as_xhtml(fz_context * ctx,fz_output * out,fz_stext_page * page,int id)327 fz_print_stext_page_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_page *page, int id)
328 {
329 	fz_stext_block *block;
330 
331 	fz_write_printf(ctx, out, "<div id=\"page%d\">\n", id);
332 
333 	for (block = page->first_block; block; block = block->next)
334 	{
335 		if (block->type == FZ_STEXT_BLOCK_IMAGE)
336 			fz_print_stext_image_as_xhtml(ctx, out, block);
337 		else if (block->type == FZ_STEXT_BLOCK_TEXT)
338 			fz_print_stext_block_as_xhtml(ctx, out, block);
339 	}
340 
341 	fz_write_string(ctx, out, "</div>\n");
342 }
343 
344 void
fz_print_stext_header_as_xhtml(fz_context * ctx,fz_output * out)345 fz_print_stext_header_as_xhtml(fz_context *ctx, fz_output *out)
346 {
347 	fz_write_string(ctx, out, "<?xml version=\"1.0\"?>\n");
348 	fz_write_string(ctx, out, "<!DOCTYPE html");
349 	fz_write_string(ctx, out, " PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\"");
350 	fz_write_string(ctx, out, " \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">\n");
351 	fz_write_string(ctx, out, "<html xmlns=\"http://www.w3.org/1999/xhtml\">\n");
352 	fz_write_string(ctx, out, "<head>\n");
353 	fz_write_string(ctx, out, "<style>\n");
354 	fz_write_string(ctx, out, "p{white-space:pre-wrap}\n");
355 	fz_write_string(ctx, out, "</style>\n");
356 	fz_write_string(ctx, out, "</head>\n");
357 	fz_write_string(ctx, out, "<body>\n");
358 }
359 
360 void
fz_print_stext_trailer_as_xhtml(fz_context * ctx,fz_output * out)361 fz_print_stext_trailer_as_xhtml(fz_context *ctx, fz_output *out)
362 {
363 	fz_write_string(ctx, out, "</body>\n");
364 	fz_write_string(ctx, out, "</html>\n");
365 }
366 
/* Detailed XML dump of the entire structured text data */
368 
369 void
fz_print_stext_page_as_xml(fz_context * ctx,fz_output * out,fz_stext_page * page,int id)370 fz_print_stext_page_as_xml(fz_context *ctx, fz_output *out, fz_stext_page *page, int id)
371 {
372 	fz_stext_block *block;
373 	fz_stext_line *line;
374 	fz_stext_char *ch;
375 
376 	fz_write_printf(ctx, out, "<page id=\"page%d\" width=\"%g\" height=\"%g\">\n", id,
377 		page->mediabox.x1 - page->mediabox.x0,
378 		page->mediabox.y1 - page->mediabox.y0);
379 
380 	for (block = page->first_block; block; block = block->next)
381 	{
382 		switch (block->type)
383 		{
384 		case FZ_STEXT_BLOCK_TEXT:
385 			fz_write_printf(ctx, out, "<block bbox=\"%g %g %g %g\">\n",
386 					block->bbox.x0, block->bbox.y0, block->bbox.x1, block->bbox.y1);
387 			for (line = block->u.t.first_line; line; line = line->next)
388 			{
389 				fz_font *font = NULL;
390 				float size = 0;
391 				const char *name = NULL;
392 
393 				fz_write_printf(ctx, out, "<line bbox=\"%g %g %g %g\" wmode=\"%d\" dir=\"%g %g\">\n",
394 						line->bbox.x0, line->bbox.y0, line->bbox.x1, line->bbox.y1,
395 						line->wmode,
396 						line->dir.x, line->dir.y);
397 
398 				for (ch = line->first_char; ch; ch = ch->next)
399 				{
400 					if (ch->font != font || ch->size != size)
401 					{
402 						if (font)
403 							fz_write_string(ctx, out, "</font>\n");
404 						font = ch->font;
405 						size = ch->size;
406 						name = font_full_name(ctx, font);
407 						fz_write_printf(ctx, out, "<font name=\"%s\" size=\"%g\">\n", name, size);
408 					}
409 					fz_write_printf(ctx, out, "<char quad=\"%g %g %g %g %g %g %g %g\" x=\"%g\" y=\"%g\" color=\"#%06x\" c=\"",
410 							ch->quad.ul.x, ch->quad.ul.y,
411 							ch->quad.ur.x, ch->quad.ur.y,
412 							ch->quad.ll.x, ch->quad.ll.y,
413 							ch->quad.lr.x, ch->quad.lr.y,
414 							ch->origin.x, ch->origin.y,
415 							ch->color);
416 					switch (ch->c)
417 					{
418 					case '<': fz_write_string(ctx, out, "&lt;"); break;
419 					case '>': fz_write_string(ctx, out, "&gt;"); break;
420 					case '&': fz_write_string(ctx, out, "&amp;"); break;
421 					case '"': fz_write_string(ctx, out, "&quot;"); break;
422 					case '\'': fz_write_string(ctx, out, "&apos;"); break;
423 					default:
424 						   if (ch->c >= 32 && ch->c <= 127)
425 							   fz_write_printf(ctx, out, "%c", ch->c);
426 						   else
427 							   fz_write_printf(ctx, out, "&#x%x;", ch->c);
428 						   break;
429 					}
430 					fz_write_string(ctx, out, "\"/>\n");
431 				}
432 
433 				if (font)
434 					fz_write_string(ctx, out, "</font>\n");
435 
436 				fz_write_string(ctx, out, "</line>\n");
437 			}
438 			fz_write_string(ctx, out, "</block>\n");
439 			break;
440 
441 		case FZ_STEXT_BLOCK_IMAGE:
442 			fz_write_printf(ctx, out, "<image bbox=\"%g %g %g %g\" />\n",
443 					block->bbox.x0, block->bbox.y0, block->bbox.x1, block->bbox.y1);
444 			break;
445 		}
446 	}
447 	fz_write_string(ctx, out, "</page>\n");
448 }
449 
/* JSON dump */
451 
452 void
fz_print_stext_page_as_json(fz_context * ctx,fz_output * out,fz_stext_page * page,float scale)453 fz_print_stext_page_as_json(fz_context *ctx, fz_output *out, fz_stext_page *page, float scale)
454 {
455 	fz_stext_block *block;
456 	fz_stext_line *line;
457 	fz_stext_char *ch;
458 
459 	fz_write_printf(ctx, out, "{%q:[", "blocks");
460 
461 	for (block = page->first_block; block; block = block->next)
462 	{
463 		if (block != page->first_block)
464 			fz_write_string(ctx, out, ",");
465 		switch (block->type)
466 		{
467 		case FZ_STEXT_BLOCK_TEXT:
468 			fz_write_printf(ctx, out, "{%q:%q,", "type", "text");
469 			fz_write_printf(ctx, out, "%q:{", "bbox");
470 			fz_write_printf(ctx, out, "%q:%d,", "x", (int)(block->bbox.x0 * scale));
471 			fz_write_printf(ctx, out, "%q:%d,", "y", (int)(block->bbox.y0 * scale));
472 			fz_write_printf(ctx, out, "%q:%d,", "w", (int)((block->bbox.x1 - block->bbox.x0) * scale));
473 			fz_write_printf(ctx, out, "%q:%d},", "h", (int)((block->bbox.y1 - block->bbox.y0) * scale));
474 			fz_write_printf(ctx, out, "%q:[", "lines");
475 
476 			for (line = block->u.t.first_line; line; line = line->next)
477 			{
478 				if (line != block->u.t.first_line)
479 					fz_write_string(ctx, out, ",");
480 				fz_write_printf(ctx, out, "{%q:%d,", "wmode", line->wmode);
481 				fz_write_printf(ctx, out, "%q:{", "bbox");
482 				fz_write_printf(ctx, out, "%q:%d,", "x", (int)(line->bbox.x0 * scale));
483 				fz_write_printf(ctx, out, "%q:%d,", "y", (int)(line->bbox.y0 * scale));
484 				fz_write_printf(ctx, out, "%q:%d,", "w", (int)((line->bbox.x1 - line->bbox.x0) * scale));
485 				fz_write_printf(ctx, out, "%q:%d},", "h", (int)((line->bbox.y1 - line->bbox.y0) * scale));
486 
487 				/* Since we force preserve-spans, the first char has the style for the entire line. */
488 				if (line->first_char)
489 				{
490 					fz_font *font = line->first_char->font;
491 					char *font_family = "sans-serif";
492 					char *font_weight = "normal";
493 					char *font_style = "normal";
494 					if (fz_font_is_monospaced(ctx, font)) font_family = "monospace";
495 					else if (fz_font_is_serif(ctx, font)) font_family = "serif";
496 					if (fz_font_is_bold(ctx, font)) font_weight = "bold";
497 					if (fz_font_is_italic(ctx, font)) font_style = "italic";
498 					fz_write_printf(ctx, out, "%q:{", "font");
499 					fz_write_printf(ctx, out, "%q:%q,", "name", fz_font_name(ctx, font));
500 					fz_write_printf(ctx, out, "%q:%q,", "family", font_family);
501 					fz_write_printf(ctx, out, "%q:%q,", "weight", font_weight);
502 					fz_write_printf(ctx, out, "%q:%q,", "style", font_style);
503 					fz_write_printf(ctx, out, "%q:%d},", "size", (int)(line->first_char->size * scale));
504 					fz_write_printf(ctx, out, "%q:%d,", "x", (int)(line->first_char->origin.x * scale));
505 					fz_write_printf(ctx, out, "%q:%d,", "y", (int)(line->first_char->origin.y * scale));
506 				}
507 
508 				fz_write_printf(ctx, out, "%q:\"", "text");
509 				for (ch = line->first_char; ch; ch = ch->next)
510 				{
511 					if (ch->c == '"' || ch->c == '\\')
512 						fz_write_printf(ctx, out, "\\%c", ch->c);
513 					else if (ch->c < 32)
514 						fz_write_printf(ctx, out, "\\u%04x", ch->c);
515 					else
516 						fz_write_printf(ctx, out, "%C", ch->c);
517 				}
518 				fz_write_printf(ctx, out, "\"}");
519 			}
520 			fz_write_string(ctx, out, "]}");
521 			break;
522 
523 		case FZ_STEXT_BLOCK_IMAGE:
524 			fz_write_printf(ctx, out, "{%q:%q,", "type", "image");
525 			fz_write_printf(ctx, out, "%q:{", "bbox");
526 			fz_write_printf(ctx, out, "%q:%d,", "x", (int)(block->bbox.x0 * scale));
527 			fz_write_printf(ctx, out, "%q:%d,", "y", (int)(block->bbox.y0 * scale));
528 			fz_write_printf(ctx, out, "%q:%d,", "w", (int)((block->bbox.x1 - block->bbox.x0) * scale));
529 			fz_write_printf(ctx, out, "%q:%d}}", "h", (int)((block->bbox.y1 - block->bbox.y0) * scale));
530 			break;
531 		}
532 	}
533 	fz_write_string(ctx, out, "]}");
534 }
535 
/* Plain text */
537 
538 void
fz_print_stext_page_as_text(fz_context * ctx,fz_output * out,fz_stext_page * page)539 fz_print_stext_page_as_text(fz_context *ctx, fz_output *out, fz_stext_page *page)
540 {
541 	fz_stext_block *block;
542 	fz_stext_line *line;
543 	fz_stext_char *ch;
544 	char utf[10];
545 	int i, n;
546 
547 	for (block = page->first_block; block; block = block->next)
548 	{
549 		if (block->type == FZ_STEXT_BLOCK_TEXT)
550 		{
551 			for (line = block->u.t.first_line; line; line = line->next)
552 			{
553 				for (ch = line->first_char; ch; ch = ch->next)
554 				{
555 					n = fz_runetochar(utf, ch->c);
556 					for (i = 0; i < n; i++)
557 						fz_write_byte(ctx, out, utf[i]);
558 				}
559 				fz_write_string(ctx, out, "\n");
560 			}
561 			fz_write_string(ctx, out, "\n");
562 		}
563 	}
564 }
565 
/* Text output writer */
567 
/* Output formats understood by the text document writer. */
enum {
	FZ_FORMAT_TEXT = 0,		/* plain text */
	FZ_FORMAT_HTML = 1,		/* HTML with preserved layout */
	FZ_FORMAT_XHTML = 2,		/* semantic XHTML */
	FZ_FORMAT_STEXT_XML = 3,	/* XML dump of the structured text */
	FZ_FORMAT_STEXT_JSON = 4,	/* JSON dump of the structured text */
};
575 
576 typedef struct
577 {
578 	fz_document_writer super;
579 	int format;
580 	int number;
581 	fz_stext_options opts;
582 	fz_stext_page *page;
583 	fz_output *out;
584 } fz_text_writer;
585 
586 static fz_device *
text_begin_page(fz_context * ctx,fz_document_writer * wri_,fz_rect mediabox)587 text_begin_page(fz_context *ctx, fz_document_writer *wri_, fz_rect mediabox)
588 {
589 	fz_text_writer *wri = (fz_text_writer*)wri_;
590 
591 	if (wri->page)
592 	{
593 		fz_drop_stext_page(ctx, wri->page);
594 		wri->page = NULL;
595 	}
596 
597 	wri->number++;
598 
599 	wri->page = fz_new_stext_page(ctx, mediabox);
600 	return fz_new_stext_device(ctx, wri->page, &wri->opts);
601 }
602 
603 static void
text_end_page(fz_context * ctx,fz_document_writer * wri_,fz_device * dev)604 text_end_page(fz_context *ctx, fz_document_writer *wri_, fz_device *dev)
605 {
606 	fz_text_writer *wri = (fz_text_writer*)wri_;
607 
608 	fz_try(ctx)
609 	{
610 		fz_close_device(ctx, dev);
611 		switch (wri->format)
612 		{
613 		default:
614 		case FZ_FORMAT_TEXT:
615 			fz_print_stext_page_as_text(ctx, wri->out, wri->page);
616 			break;
617 		case FZ_FORMAT_HTML:
618 			fz_print_stext_page_as_html(ctx, wri->out, wri->page, wri->number);
619 			break;
620 		case FZ_FORMAT_XHTML:
621 			fz_print_stext_page_as_xhtml(ctx, wri->out, wri->page, wri->number);
622 			break;
623 		case FZ_FORMAT_STEXT_XML:
624 			fz_print_stext_page_as_xml(ctx, wri->out, wri->page, wri->number);
625 			break;
626 		case FZ_FORMAT_STEXT_JSON:
627 			if (wri->number > 1)
628 				fz_write_string(ctx, wri->out, ",");
629 			fz_print_stext_page_as_json(ctx, wri->out, wri->page, 1);
630 			break;
631 		}
632 	}
633 	fz_always(ctx)
634 	{
635 		fz_drop_device(ctx, dev);
636 		fz_drop_stext_page(ctx, wri->page);
637 		wri->page = NULL;
638 	}
639 	fz_catch(ctx)
640 		fz_rethrow(ctx);
641 }
642 
643 static void
text_close_writer(fz_context * ctx,fz_document_writer * wri_)644 text_close_writer(fz_context *ctx, fz_document_writer *wri_)
645 {
646 	fz_text_writer *wri = (fz_text_writer*)wri_;
647 	switch (wri->format)
648 	{
649 	case FZ_FORMAT_HTML:
650 		fz_print_stext_trailer_as_html(ctx, wri->out);
651 		break;
652 	case FZ_FORMAT_XHTML:
653 		fz_print_stext_trailer_as_xhtml(ctx, wri->out);
654 		break;
655 	case FZ_FORMAT_STEXT_XML:
656 		fz_write_string(ctx, wri->out, "</document>\n");
657 		break;
658 	case FZ_FORMAT_STEXT_JSON:
659 		fz_write_string(ctx, wri->out, "]\n");
660 		break;
661 	}
662 	fz_close_output(ctx, wri->out);
663 }
664 
665 static void
text_drop_writer(fz_context * ctx,fz_document_writer * wri_)666 text_drop_writer(fz_context *ctx, fz_document_writer *wri_)
667 {
668 	fz_text_writer *wri = (fz_text_writer*)wri_;
669 	fz_drop_stext_page(ctx, wri->page);
670 	fz_drop_output(ctx, wri->out);
671 }
672 
673 fz_document_writer *
fz_new_text_writer_with_output(fz_context * ctx,const char * format,fz_output * out,const char * options)674 fz_new_text_writer_with_output(fz_context *ctx, const char *format, fz_output *out, const char *options)
675 {
676 	fz_text_writer *wri;
677 
678 	wri = fz_new_derived_document_writer(ctx, fz_text_writer, text_begin_page, text_end_page, text_close_writer, text_drop_writer);
679 	fz_try(ctx)
680 	{
681 		fz_parse_stext_options(ctx, &wri->opts, options);
682 
683 		wri->format = FZ_FORMAT_TEXT;
684 		if (!strcmp(format, "text"))
685 			wri->format = FZ_FORMAT_TEXT;
686 		else if (!strcmp(format, "html"))
687 			wri->format = FZ_FORMAT_HTML;
688 		else if (!strcmp(format, "xhtml"))
689 			wri->format = FZ_FORMAT_XHTML;
690 		else if (!strcmp(format, "stext"))
691 			wri->format = FZ_FORMAT_STEXT_XML;
692 		else if (!strcmp(format, "stext.xml"))
693 			wri->format = FZ_FORMAT_STEXT_XML;
694 		else if (!strcmp(format, "stext.json"))
695 		{
696 			wri->format = FZ_FORMAT_STEXT_JSON;
697 			wri->opts.flags |= FZ_STEXT_PRESERVE_SPANS;
698 		}
699 
700 		wri->out = out;
701 
702 		switch (wri->format)
703 		{
704 		case FZ_FORMAT_HTML:
705 			fz_print_stext_header_as_html(ctx, wri->out);
706 			break;
707 		case FZ_FORMAT_XHTML:
708 			fz_print_stext_header_as_xhtml(ctx, wri->out);
709 			break;
710 		case FZ_FORMAT_STEXT_XML:
711 			fz_write_string(ctx, wri->out, "<?xml version=\"1.0\"?>\n");
712 			fz_write_string(ctx, wri->out, "<document>\n");
713 			break;
714 		case FZ_FORMAT_STEXT_JSON:
715 			fz_write_string(ctx, wri->out, "[");
716 			break;
717 		}
718 	}
719 	fz_catch(ctx)
720 	{
721 		fz_free(ctx, wri);
722 		fz_rethrow(ctx);
723 	}
724 
725 	return (fz_document_writer*)wri;
726 }
727 
728 fz_document_writer *
fz_new_text_writer(fz_context * ctx,const char * format,const char * path,const char * options)729 fz_new_text_writer(fz_context *ctx, const char *format, const char *path, const char *options)
730 {
731 	fz_output *out = fz_new_output_with_path(ctx, path ? path : "out.txt", 0);
732 	fz_document_writer *wri = NULL;
733 	fz_try(ctx)
734 		wri = fz_new_text_writer_with_output(ctx, format, out, options);
735 	fz_catch(ctx)
736 	{
737 		fz_drop_output(ctx, out);
738 		fz_rethrow(ctx);
739 	}
740 	return wri;
741 }
742