1 #include "mupdf/fitz.h"
2
3 #define SUBSCRIPT_OFFSET 0.2f
4 #define SUPERSCRIPT_OFFSET -0.2f
5
6 #include <ft2build.h>
7 #include FT_FREETYPE_H
8
9 /* HTML output (visual formatting with preserved layout) */
10
11 static int
detect_super_script(fz_stext_line * line,fz_stext_char * ch)12 detect_super_script(fz_stext_line *line, fz_stext_char *ch)
13 {
14 if (line->wmode == 0 && line->dir.x == 1 && line->dir.y == 0)
15 return ch->origin.y < line->first_char->origin.y - ch->size * 0.1f;
16 return 0;
17 }
18
19 static const char *
font_full_name(fz_context * ctx,fz_font * font)20 font_full_name(fz_context *ctx, fz_font *font)
21 {
22 const char *name = fz_font_name(ctx, font);
23 const char *s = strchr(name, '+');
24 return s ? s + 1 : name;
25 }
26
27 static void
font_family_name(fz_context * ctx,fz_font * font,char * buf,int size,int is_mono,int is_serif)28 font_family_name(fz_context *ctx, fz_font *font, char *buf, int size, int is_mono, int is_serif)
29 {
30 const char *name = font_full_name(ctx, font);
31 char *s;
32 fz_strlcpy(buf, name, size);
33 s = strrchr(buf, '-');
34 if (s)
35 *s = 0;
36 if (is_mono)
37 fz_strlcat(buf, ",monospace", size);
38 else
39 fz_strlcat(buf, is_serif ? ",serif" : ",sans-serif", size);
40 }
41
42 static void
fz_print_style_begin_html(fz_context * ctx,fz_output * out,fz_font * font,float size,int sup,int color)43 fz_print_style_begin_html(fz_context *ctx, fz_output *out, fz_font *font, float size, int sup, int color)
44 {
45 char family[80];
46
47 int is_bold = fz_font_is_bold(ctx, font);
48 int is_italic = fz_font_is_italic(ctx, font);
49 int is_serif = fz_font_is_serif(ctx, font);
50 int is_mono = fz_font_is_monospaced(ctx, font);
51
52 font_family_name(ctx, font, family, sizeof family, is_mono, is_serif);
53
54 if (sup) fz_write_string(ctx, out, "<sup>");
55 if (is_mono) fz_write_string(ctx, out, "<tt>");
56 if (is_bold) fz_write_string(ctx, out, "<b>");
57 if (is_italic) fz_write_string(ctx, out, "<i>");
58 fz_write_printf(ctx, out, "<span style=\"font-family:%s;font-size:%gpt", family, size);
59 if (color != 0)
60 fz_write_printf(ctx, out, ";color:#%06x", color);
61 fz_write_printf(ctx, out, "\">");
62 }
63
64 static void
fz_print_style_end_html(fz_context * ctx,fz_output * out,fz_font * font,float size,int sup)65 fz_print_style_end_html(fz_context *ctx, fz_output *out, fz_font *font, float size, int sup)
66 {
67 int is_mono = fz_font_is_monospaced(ctx, font);
68 int is_bold = fz_font_is_bold(ctx,font);
69 int is_italic = fz_font_is_italic(ctx, font);
70
71 fz_write_string(ctx, out, "</span>");
72 if (is_italic) fz_write_string(ctx, out, "</i>");
73 if (is_bold) fz_write_string(ctx, out, "</b>");
74 if (is_mono) fz_write_string(ctx, out, "</tt>");
75 if (sup) fz_write_string(ctx, out, "</sup>");
76 }
77
78 static void
fz_print_stext_image_as_html(fz_context * ctx,fz_output * out,fz_stext_block * block)79 fz_print_stext_image_as_html(fz_context *ctx, fz_output *out, fz_stext_block *block)
80 {
81 int x = block->bbox.x0;
82 int y = block->bbox.y0;
83 int w = block->bbox.x1 - block->bbox.x0;
84 int h = block->bbox.y1 - block->bbox.y0;
85
86 fz_write_printf(ctx, out, "<img style=\"position:absolute;top:%dpt;left:%dpt;width:%dpt;height:%dpt\" src=\"", y, x, w, h);
87 fz_write_image_as_data_uri(ctx, out, block->u.i.image);
88 fz_write_string(ctx, out, "\">\n");
89 }
90
91 void
fz_print_stext_block_as_html(fz_context * ctx,fz_output * out,fz_stext_block * block)92 fz_print_stext_block_as_html(fz_context *ctx, fz_output *out, fz_stext_block *block)
93 {
94 fz_stext_line *line;
95 fz_stext_char *ch;
96 int x, y;
97
98 fz_font *font = NULL;
99 float size = 0;
100 int sup = 0;
101 int color = 0;
102
103 for (line = block->u.t.first_line; line; line = line->next)
104 {
105 x = line->bbox.x0;
106 y = line->bbox.y0;
107
108 fz_write_printf(ctx, out, "<p style=\"position:absolute;white-space:pre;margin:0;padding:0;top:%dpt;left:%dpt\">", y, x);
109 font = NULL;
110
111 for (ch = line->first_char; ch; ch = ch->next)
112 {
113 int ch_sup = detect_super_script(line, ch);
114 if (ch->font != font || ch->size != size || ch_sup != sup || ch->color != color)
115 {
116 if (font)
117 fz_print_style_end_html(ctx, out, font, size, sup);
118 font = ch->font;
119 size = ch->size;
120 color = ch->color;
121 sup = ch_sup;
122 fz_print_style_begin_html(ctx, out, font, size, sup, color);
123 }
124
125 switch (ch->c)
126 {
127 default:
128 if (ch->c >= 32 && ch->c <= 127)
129 fz_write_byte(ctx, out, ch->c);
130 else
131 fz_write_printf(ctx, out, "&#x%x;", ch->c);
132 break;
133 case '<': fz_write_string(ctx, out, "<"); break;
134 case '>': fz_write_string(ctx, out, ">"); break;
135 case '&': fz_write_string(ctx, out, "&"); break;
136 case '"': fz_write_string(ctx, out, """); break;
137 case '\'': fz_write_string(ctx, out, "'"); break;
138 }
139 }
140
141 if (font)
142 fz_print_style_end_html(ctx, out, font, size, sup);
143
144 fz_write_string(ctx, out, "</p>\n");
145 }
146 }
147
148 void
fz_print_stext_page_as_html(fz_context * ctx,fz_output * out,fz_stext_page * page,int id)149 fz_print_stext_page_as_html(fz_context *ctx, fz_output *out, fz_stext_page *page, int id)
150 {
151 fz_stext_block *block;
152
153 int w = page->mediabox.x1 - page->mediabox.x0;
154 int h = page->mediabox.y1 - page->mediabox.y0;
155
156 fz_write_printf(ctx, out, "<div id=\"page%d\" style=\"position:relative;width:%dpt;height:%dpt;background-color:white\">\n", id, w, h);
157
158 for (block = page->first_block; block; block = block->next)
159 {
160 if (block->type == FZ_STEXT_BLOCK_IMAGE)
161 fz_print_stext_image_as_html(ctx, out, block);
162 else if (block->type == FZ_STEXT_BLOCK_TEXT)
163 fz_print_stext_block_as_html(ctx, out, block);
164 }
165
166 fz_write_string(ctx, out, "</div>\n");
167 }
168
169 void
fz_print_stext_header_as_html(fz_context * ctx,fz_output * out)170 fz_print_stext_header_as_html(fz_context *ctx, fz_output *out)
171 {
172 fz_write_string(ctx, out, "<!DOCTYPE html>\n");
173 fz_write_string(ctx, out, "<html>\n");
174 fz_write_string(ctx, out, "<head>\n");
175 fz_write_string(ctx, out, "<style>\n");
176 fz_write_string(ctx, out, "body{background-color:gray}\n");
177 fz_write_string(ctx, out, "div{margin:1em auto}\n");
178 fz_write_string(ctx, out, "</style>\n");
179 fz_write_string(ctx, out, "</head>\n");
180 fz_write_string(ctx, out, "<body>\n");
181 }
182
183 void
fz_print_stext_trailer_as_html(fz_context * ctx,fz_output * out)184 fz_print_stext_trailer_as_html(fz_context *ctx, fz_output *out)
185 {
186 fz_write_string(ctx, out, "</body>\n");
187 fz_write_string(ctx, out, "</html>\n");
188 }
189
190 /* XHTML output (semantic, little layout, suitable for reflow) */
191
192 static void
fz_print_stext_image_as_xhtml(fz_context * ctx,fz_output * out,fz_stext_block * block)193 fz_print_stext_image_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_block *block)
194 {
195 int w = block->bbox.x1 - block->bbox.x0;
196 int h = block->bbox.y1 - block->bbox.y0;
197
198 fz_write_printf(ctx, out, "<p><img width=\"%d\" height=\"%d\" src=\"", w, h);
199 fz_write_image_as_data_uri(ctx, out, block->u.i.image);
200 fz_write_string(ctx, out, "\"/></p>\n");
201 }
202
203 static void
fz_print_style_begin_xhtml(fz_context * ctx,fz_output * out,fz_font * font,int sup)204 fz_print_style_begin_xhtml(fz_context *ctx, fz_output *out, fz_font *font, int sup)
205 {
206 int is_mono = fz_font_is_monospaced(ctx, font);
207 int is_bold = fz_font_is_bold(ctx, font);
208 int is_italic = fz_font_is_italic(ctx, font);
209
210 if (sup)
211 fz_write_string(ctx, out, "<sup>");
212 if (is_mono)
213 fz_write_string(ctx, out, "<tt>");
214 if (is_bold)
215 fz_write_string(ctx, out, "<b>");
216 if (is_italic)
217 fz_write_string(ctx, out, "<i>");
218 }
219
220 static void
fz_print_style_end_xhtml(fz_context * ctx,fz_output * out,fz_font * font,int sup)221 fz_print_style_end_xhtml(fz_context *ctx, fz_output *out, fz_font *font, int sup)
222 {
223 int is_mono = fz_font_is_monospaced(ctx, font);
224 int is_bold = fz_font_is_bold(ctx, font);
225 int is_italic = fz_font_is_italic(ctx, font);
226
227 if (is_italic)
228 fz_write_string(ctx, out, "</i>");
229 if (is_bold)
230 fz_write_string(ctx, out, "</b>");
231 if (is_mono)
232 fz_write_string(ctx, out, "</tt>");
233 if (sup)
234 fz_write_string(ctx, out, "</sup>");
235 }
236
/*
	Return the arithmetic mean of the size field over the character list
	starting at ch, or 0 when the list is empty.
*/
static float avg_font_size_of_line(fz_stext_char *ch)
{
	float total = 0;
	int count = 0;

	if (ch == NULL)
		return 0;

	for (; ch != NULL; ch = ch->next)
	{
		total += ch->size;
		count++;
	}

	return total / count;
}
251
/*
	Map a font size to the name of the XHTML element used for text of that
	size: "h1" for sizes of at least 20, "h2" for at least 15, "h3" for at
	least 12, and "p" for anything smaller.

	The same pointer is returned every time a given tag is selected, so
	callers may compare results by address.
*/
static const char *tag_from_font_size(float size)
{
	static const struct
	{
		float min_size;
		const char *tag;
	} levels[] = {
		{ 20, "h1" },
		{ 15, "h2" },
		{ 12, "h3" },
	};
	static const char paragraph[] = "p";
	int i;

	/* Thresholds are listed from largest to smallest; first match wins. */
	for (i = 0; i < (int)(sizeof levels / sizeof levels[0]); ++i)
		if (size >= levels[i].min_size)
			return levels[i].tag;

	return paragraph;
}
259
/*
	Write a text block as reflowable XHTML.

	Every line is assigned an element name by tag_from_font_size() applied
	to the line's average font size. Consecutive lines with the same
	element name share one element; when the name changes, the current
	element (and any open style run) is closed and a new one is opened,
	re-opening the style run inside it.

	A single space is inserted between lines unless the previous line
	ended with a space. Style runs change whenever the font or the
	superscript state of a character differs from the current run.

	Characters are written as follows:
	- '<', '>', '&', '"' and '\'' become the named entities
	  &lt; &gt; &amp; &quot; &apos; so that text content can never be
	  parsed as markup;
	- other code points in the range 32..127 are written as a single byte;
	- everything else is written as a hexadecimal character reference.

	A block without any lines produces no output.
*/
static void fz_print_stext_block_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_block *block)
{
	fz_stext_line *line;
	fz_stext_char *ch;

	fz_font *font = NULL;
	int sup = 0;
	int sp = 1;
	const char *tag = NULL;
	const char *new_tag;

	for (line = block->u.t.first_line; line; line = line->next)
	{
		new_tag = tag_from_font_size(avg_font_size_of_line(line->first_char));
		/* Pointer comparison: tag values are compared by address, not by content. */
		if (tag != new_tag)
		{
			if (tag)
			{
				if (font)
					fz_print_style_end_xhtml(ctx, out, font, sup);
				fz_write_printf(ctx, out, "</%s>", tag);
			}
			tag = new_tag;
			fz_write_printf(ctx, out, "<%s>", tag);
			if (font)
				fz_print_style_begin_xhtml(ctx, out, font, sup);
		}

		/* Separate this line from the previous one unless that one
		 * already ended with a space. */
		if (!sp)
			fz_write_byte(ctx, out, ' ');

		for (ch = line->first_char; ch; ch = ch->next)
		{
			int ch_sup = detect_super_script(line, ch);
			if (ch->font != font || ch_sup != sup)
			{
				if (font)
					fz_print_style_end_xhtml(ctx, out, font, sup);
				font = ch->font;
				sup = ch_sup;
				fz_print_style_begin_xhtml(ctx, out, font, sup);
			}

			sp = (ch->c == ' ');
			switch (ch->c)
			{
			default:
				if (ch->c >= 32 && ch->c <= 127)
					fz_write_byte(ctx, out, ch->c);
				else
					fz_write_printf(ctx, out, "&#x%x;", ch->c);
				break;
			case '<': fz_write_string(ctx, out, "&lt;"); break;
			case '>': fz_write_string(ctx, out, "&gt;"); break;
			case '&': fz_write_string(ctx, out, "&amp;"); break;
			case '"': fz_write_string(ctx, out, "&quot;"); break;
			case '\'': fz_write_string(ctx, out, "&apos;"); break;
			}
		}
	}

	if (font)
		fz_print_style_end_xhtml(ctx, out, font, sup);

	/* tag is still NULL when the block had no lines: no element was
	 * opened, so there is nothing to close. */
	if (tag)
		fz_write_printf(ctx, out, "</%s>\n", tag);
}
325
326 void
fz_print_stext_page_as_xhtml(fz_context * ctx,fz_output * out,fz_stext_page * page,int id)327 fz_print_stext_page_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_page *page, int id)
328 {
329 fz_stext_block *block;
330
331 fz_write_printf(ctx, out, "<div id=\"page%d\">\n", id);
332
333 for (block = page->first_block; block; block = block->next)
334 {
335 if (block->type == FZ_STEXT_BLOCK_IMAGE)
336 fz_print_stext_image_as_xhtml(ctx, out, block);
337 else if (block->type == FZ_STEXT_BLOCK_TEXT)
338 fz_print_stext_block_as_xhtml(ctx, out, block);
339 }
340
341 fz_write_string(ctx, out, "</div>\n");
342 }
343
344 void
fz_print_stext_header_as_xhtml(fz_context * ctx,fz_output * out)345 fz_print_stext_header_as_xhtml(fz_context *ctx, fz_output *out)
346 {
347 fz_write_string(ctx, out, "<?xml version=\"1.0\"?>\n");
348 fz_write_string(ctx, out, "<!DOCTYPE html");
349 fz_write_string(ctx, out, " PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\"");
350 fz_write_string(ctx, out, " \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">\n");
351 fz_write_string(ctx, out, "<html xmlns=\"http://www.w3.org/1999/xhtml\">\n");
352 fz_write_string(ctx, out, "<head>\n");
353 fz_write_string(ctx, out, "<style>\n");
354 fz_write_string(ctx, out, "p{white-space:pre-wrap}\n");
355 fz_write_string(ctx, out, "</style>\n");
356 fz_write_string(ctx, out, "</head>\n");
357 fz_write_string(ctx, out, "<body>\n");
358 }
359
360 void
fz_print_stext_trailer_as_xhtml(fz_context * ctx,fz_output * out)361 fz_print_stext_trailer_as_xhtml(fz_context *ctx, fz_output *out)
362 {
363 fz_write_string(ctx, out, "</body>\n");
364 fz_write_string(ctx, out, "</html>\n");
365 }
366
367 /* Detailed XML dump of the entire structured text data */
368
369 void
fz_print_stext_page_as_xml(fz_context * ctx,fz_output * out,fz_stext_page * page,int id)370 fz_print_stext_page_as_xml(fz_context *ctx, fz_output *out, fz_stext_page *page, int id)
371 {
372 fz_stext_block *block;
373 fz_stext_line *line;
374 fz_stext_char *ch;
375
376 fz_write_printf(ctx, out, "<page id=\"page%d\" width=\"%g\" height=\"%g\">\n", id,
377 page->mediabox.x1 - page->mediabox.x0,
378 page->mediabox.y1 - page->mediabox.y0);
379
380 for (block = page->first_block; block; block = block->next)
381 {
382 switch (block->type)
383 {
384 case FZ_STEXT_BLOCK_TEXT:
385 fz_write_printf(ctx, out, "<block bbox=\"%g %g %g %g\">\n",
386 block->bbox.x0, block->bbox.y0, block->bbox.x1, block->bbox.y1);
387 for (line = block->u.t.first_line; line; line = line->next)
388 {
389 fz_font *font = NULL;
390 float size = 0;
391 const char *name = NULL;
392
393 fz_write_printf(ctx, out, "<line bbox=\"%g %g %g %g\" wmode=\"%d\" dir=\"%g %g\">\n",
394 line->bbox.x0, line->bbox.y0, line->bbox.x1, line->bbox.y1,
395 line->wmode,
396 line->dir.x, line->dir.y);
397
398 for (ch = line->first_char; ch; ch = ch->next)
399 {
400 if (ch->font != font || ch->size != size)
401 {
402 if (font)
403 fz_write_string(ctx, out, "</font>\n");
404 font = ch->font;
405 size = ch->size;
406 name = font_full_name(ctx, font);
407 fz_write_printf(ctx, out, "<font name=\"%s\" size=\"%g\">\n", name, size);
408 }
409 fz_write_printf(ctx, out, "<char quad=\"%g %g %g %g %g %g %g %g\" x=\"%g\" y=\"%g\" color=\"#%06x\" c=\"",
410 ch->quad.ul.x, ch->quad.ul.y,
411 ch->quad.ur.x, ch->quad.ur.y,
412 ch->quad.ll.x, ch->quad.ll.y,
413 ch->quad.lr.x, ch->quad.lr.y,
414 ch->origin.x, ch->origin.y,
415 ch->color);
416 switch (ch->c)
417 {
418 case '<': fz_write_string(ctx, out, "<"); break;
419 case '>': fz_write_string(ctx, out, ">"); break;
420 case '&': fz_write_string(ctx, out, "&"); break;
421 case '"': fz_write_string(ctx, out, """); break;
422 case '\'': fz_write_string(ctx, out, "'"); break;
423 default:
424 if (ch->c >= 32 && ch->c <= 127)
425 fz_write_printf(ctx, out, "%c", ch->c);
426 else
427 fz_write_printf(ctx, out, "&#x%x;", ch->c);
428 break;
429 }
430 fz_write_string(ctx, out, "\"/>\n");
431 }
432
433 if (font)
434 fz_write_string(ctx, out, "</font>\n");
435
436 fz_write_string(ctx, out, "</line>\n");
437 }
438 fz_write_string(ctx, out, "</block>\n");
439 break;
440
441 case FZ_STEXT_BLOCK_IMAGE:
442 fz_write_printf(ctx, out, "<image bbox=\"%g %g %g %g\" />\n",
443 block->bbox.x0, block->bbox.y0, block->bbox.x1, block->bbox.y1);
444 break;
445 }
446 }
447 fz_write_string(ctx, out, "</page>\n");
448 }
449
450 /* JSON dump */
451
452 void
fz_print_stext_page_as_json(fz_context * ctx,fz_output * out,fz_stext_page * page,float scale)453 fz_print_stext_page_as_json(fz_context *ctx, fz_output *out, fz_stext_page *page, float scale)
454 {
455 fz_stext_block *block;
456 fz_stext_line *line;
457 fz_stext_char *ch;
458
459 fz_write_printf(ctx, out, "{%q:[", "blocks");
460
461 for (block = page->first_block; block; block = block->next)
462 {
463 if (block != page->first_block)
464 fz_write_string(ctx, out, ",");
465 switch (block->type)
466 {
467 case FZ_STEXT_BLOCK_TEXT:
468 fz_write_printf(ctx, out, "{%q:%q,", "type", "text");
469 fz_write_printf(ctx, out, "%q:{", "bbox");
470 fz_write_printf(ctx, out, "%q:%d,", "x", (int)(block->bbox.x0 * scale));
471 fz_write_printf(ctx, out, "%q:%d,", "y", (int)(block->bbox.y0 * scale));
472 fz_write_printf(ctx, out, "%q:%d,", "w", (int)((block->bbox.x1 - block->bbox.x0) * scale));
473 fz_write_printf(ctx, out, "%q:%d},", "h", (int)((block->bbox.y1 - block->bbox.y0) * scale));
474 fz_write_printf(ctx, out, "%q:[", "lines");
475
476 for (line = block->u.t.first_line; line; line = line->next)
477 {
478 if (line != block->u.t.first_line)
479 fz_write_string(ctx, out, ",");
480 fz_write_printf(ctx, out, "{%q:%d,", "wmode", line->wmode);
481 fz_write_printf(ctx, out, "%q:{", "bbox");
482 fz_write_printf(ctx, out, "%q:%d,", "x", (int)(line->bbox.x0 * scale));
483 fz_write_printf(ctx, out, "%q:%d,", "y", (int)(line->bbox.y0 * scale));
484 fz_write_printf(ctx, out, "%q:%d,", "w", (int)((line->bbox.x1 - line->bbox.x0) * scale));
485 fz_write_printf(ctx, out, "%q:%d},", "h", (int)((line->bbox.y1 - line->bbox.y0) * scale));
486
487 /* Since we force preserve-spans, the first char has the style for the entire line. */
488 if (line->first_char)
489 {
490 fz_font *font = line->first_char->font;
491 char *font_family = "sans-serif";
492 char *font_weight = "normal";
493 char *font_style = "normal";
494 if (fz_font_is_monospaced(ctx, font)) font_family = "monospace";
495 else if (fz_font_is_serif(ctx, font)) font_family = "serif";
496 if (fz_font_is_bold(ctx, font)) font_weight = "bold";
497 if (fz_font_is_italic(ctx, font)) font_style = "italic";
498 fz_write_printf(ctx, out, "%q:{", "font");
499 fz_write_printf(ctx, out, "%q:%q,", "name", fz_font_name(ctx, font));
500 fz_write_printf(ctx, out, "%q:%q,", "family", font_family);
501 fz_write_printf(ctx, out, "%q:%q,", "weight", font_weight);
502 fz_write_printf(ctx, out, "%q:%q,", "style", font_style);
503 fz_write_printf(ctx, out, "%q:%d},", "size", (int)(line->first_char->size * scale));
504 fz_write_printf(ctx, out, "%q:%d,", "x", (int)(line->first_char->origin.x * scale));
505 fz_write_printf(ctx, out, "%q:%d,", "y", (int)(line->first_char->origin.y * scale));
506 }
507
508 fz_write_printf(ctx, out, "%q:\"", "text");
509 for (ch = line->first_char; ch; ch = ch->next)
510 {
511 if (ch->c == '"' || ch->c == '\\')
512 fz_write_printf(ctx, out, "\\%c", ch->c);
513 else if (ch->c < 32)
514 fz_write_printf(ctx, out, "\\u%04x", ch->c);
515 else
516 fz_write_printf(ctx, out, "%C", ch->c);
517 }
518 fz_write_printf(ctx, out, "\"}");
519 }
520 fz_write_string(ctx, out, "]}");
521 break;
522
523 case FZ_STEXT_BLOCK_IMAGE:
524 fz_write_printf(ctx, out, "{%q:%q,", "type", "image");
525 fz_write_printf(ctx, out, "%q:{", "bbox");
526 fz_write_printf(ctx, out, "%q:%d,", "x", (int)(block->bbox.x0 * scale));
527 fz_write_printf(ctx, out, "%q:%d,", "y", (int)(block->bbox.y0 * scale));
528 fz_write_printf(ctx, out, "%q:%d,", "w", (int)((block->bbox.x1 - block->bbox.x0) * scale));
529 fz_write_printf(ctx, out, "%q:%d}}", "h", (int)((block->bbox.y1 - block->bbox.y0) * scale));
530 break;
531 }
532 }
533 fz_write_string(ctx, out, "]}");
534 }
535
536 /* Plain text */
537
538 void
fz_print_stext_page_as_text(fz_context * ctx,fz_output * out,fz_stext_page * page)539 fz_print_stext_page_as_text(fz_context *ctx, fz_output *out, fz_stext_page *page)
540 {
541 fz_stext_block *block;
542 fz_stext_line *line;
543 fz_stext_char *ch;
544 char utf[10];
545 int i, n;
546
547 for (block = page->first_block; block; block = block->next)
548 {
549 if (block->type == FZ_STEXT_BLOCK_TEXT)
550 {
551 for (line = block->u.t.first_line; line; line = line->next)
552 {
553 for (ch = line->first_char; ch; ch = ch->next)
554 {
555 n = fz_runetochar(utf, ch->c);
556 for (i = 0; i < n; i++)
557 fz_write_byte(ctx, out, utf[i]);
558 }
559 fz_write_string(ctx, out, "\n");
560 }
561 fz_write_string(ctx, out, "\n");
562 }
563 }
564 }
565
566 /* Text output writer */
567
/* Output formats understood by the text document writer. */
enum
{
	FZ_FORMAT_TEXT = 0,		/* plain text */
	FZ_FORMAT_HTML = 1,		/* HTML */
	FZ_FORMAT_XHTML = 2,		/* XHTML */
	FZ_FORMAT_STEXT_XML = 3,	/* structured text dump as XML */
	FZ_FORMAT_STEXT_JSON = 4,	/* structured text dump as JSON */
};
575
576 typedef struct
577 {
578 fz_document_writer super;
579 int format;
580 int number;
581 fz_stext_options opts;
582 fz_stext_page *page;
583 fz_output *out;
584 } fz_text_writer;
585
586 static fz_device *
text_begin_page(fz_context * ctx,fz_document_writer * wri_,fz_rect mediabox)587 text_begin_page(fz_context *ctx, fz_document_writer *wri_, fz_rect mediabox)
588 {
589 fz_text_writer *wri = (fz_text_writer*)wri_;
590
591 if (wri->page)
592 {
593 fz_drop_stext_page(ctx, wri->page);
594 wri->page = NULL;
595 }
596
597 wri->number++;
598
599 wri->page = fz_new_stext_page(ctx, mediabox);
600 return fz_new_stext_device(ctx, wri->page, &wri->opts);
601 }
602
603 static void
text_end_page(fz_context * ctx,fz_document_writer * wri_,fz_device * dev)604 text_end_page(fz_context *ctx, fz_document_writer *wri_, fz_device *dev)
605 {
606 fz_text_writer *wri = (fz_text_writer*)wri_;
607
608 fz_try(ctx)
609 {
610 fz_close_device(ctx, dev);
611 switch (wri->format)
612 {
613 default:
614 case FZ_FORMAT_TEXT:
615 fz_print_stext_page_as_text(ctx, wri->out, wri->page);
616 break;
617 case FZ_FORMAT_HTML:
618 fz_print_stext_page_as_html(ctx, wri->out, wri->page, wri->number);
619 break;
620 case FZ_FORMAT_XHTML:
621 fz_print_stext_page_as_xhtml(ctx, wri->out, wri->page, wri->number);
622 break;
623 case FZ_FORMAT_STEXT_XML:
624 fz_print_stext_page_as_xml(ctx, wri->out, wri->page, wri->number);
625 break;
626 case FZ_FORMAT_STEXT_JSON:
627 if (wri->number > 1)
628 fz_write_string(ctx, wri->out, ",");
629 fz_print_stext_page_as_json(ctx, wri->out, wri->page, 1);
630 break;
631 }
632 }
633 fz_always(ctx)
634 {
635 fz_drop_device(ctx, dev);
636 fz_drop_stext_page(ctx, wri->page);
637 wri->page = NULL;
638 }
639 fz_catch(ctx)
640 fz_rethrow(ctx);
641 }
642
643 static void
text_close_writer(fz_context * ctx,fz_document_writer * wri_)644 text_close_writer(fz_context *ctx, fz_document_writer *wri_)
645 {
646 fz_text_writer *wri = (fz_text_writer*)wri_;
647 switch (wri->format)
648 {
649 case FZ_FORMAT_HTML:
650 fz_print_stext_trailer_as_html(ctx, wri->out);
651 break;
652 case FZ_FORMAT_XHTML:
653 fz_print_stext_trailer_as_xhtml(ctx, wri->out);
654 break;
655 case FZ_FORMAT_STEXT_XML:
656 fz_write_string(ctx, wri->out, "</document>\n");
657 break;
658 case FZ_FORMAT_STEXT_JSON:
659 fz_write_string(ctx, wri->out, "]\n");
660 break;
661 }
662 fz_close_output(ctx, wri->out);
663 }
664
665 static void
text_drop_writer(fz_context * ctx,fz_document_writer * wri_)666 text_drop_writer(fz_context *ctx, fz_document_writer *wri_)
667 {
668 fz_text_writer *wri = (fz_text_writer*)wri_;
669 fz_drop_stext_page(ctx, wri->page);
670 fz_drop_output(ctx, wri->out);
671 }
672
673 fz_document_writer *
fz_new_text_writer_with_output(fz_context * ctx,const char * format,fz_output * out,const char * options)674 fz_new_text_writer_with_output(fz_context *ctx, const char *format, fz_output *out, const char *options)
675 {
676 fz_text_writer *wri;
677
678 wri = fz_new_derived_document_writer(ctx, fz_text_writer, text_begin_page, text_end_page, text_close_writer, text_drop_writer);
679 fz_try(ctx)
680 {
681 fz_parse_stext_options(ctx, &wri->opts, options);
682
683 wri->format = FZ_FORMAT_TEXT;
684 if (!strcmp(format, "text"))
685 wri->format = FZ_FORMAT_TEXT;
686 else if (!strcmp(format, "html"))
687 wri->format = FZ_FORMAT_HTML;
688 else if (!strcmp(format, "xhtml"))
689 wri->format = FZ_FORMAT_XHTML;
690 else if (!strcmp(format, "stext"))
691 wri->format = FZ_FORMAT_STEXT_XML;
692 else if (!strcmp(format, "stext.xml"))
693 wri->format = FZ_FORMAT_STEXT_XML;
694 else if (!strcmp(format, "stext.json"))
695 {
696 wri->format = FZ_FORMAT_STEXT_JSON;
697 wri->opts.flags |= FZ_STEXT_PRESERVE_SPANS;
698 }
699
700 wri->out = out;
701
702 switch (wri->format)
703 {
704 case FZ_FORMAT_HTML:
705 fz_print_stext_header_as_html(ctx, wri->out);
706 break;
707 case FZ_FORMAT_XHTML:
708 fz_print_stext_header_as_xhtml(ctx, wri->out);
709 break;
710 case FZ_FORMAT_STEXT_XML:
711 fz_write_string(ctx, wri->out, "<?xml version=\"1.0\"?>\n");
712 fz_write_string(ctx, wri->out, "<document>\n");
713 break;
714 case FZ_FORMAT_STEXT_JSON:
715 fz_write_string(ctx, wri->out, "[");
716 break;
717 }
718 }
719 fz_catch(ctx)
720 {
721 fz_free(ctx, wri);
722 fz_rethrow(ctx);
723 }
724
725 return (fz_document_writer*)wri;
726 }
727
728 fz_document_writer *
fz_new_text_writer(fz_context * ctx,const char * format,const char * path,const char * options)729 fz_new_text_writer(fz_context *ctx, const char *format, const char *path, const char *options)
730 {
731 fz_output *out = fz_new_output_with_path(ctx, path ? path : "out.txt", 0);
732 fz_document_writer *wri = NULL;
733 fz_try(ctx)
734 wri = fz_new_text_writer_with_output(ctx, format, out, options);
735 fz_catch(ctx)
736 {
737 fz_drop_output(ctx, out);
738 fz_rethrow(ctx);
739 }
740 return wri;
741 }
742