1 #include "mupdf/fitz.h"
2 #include "mupdf/ucdn.h"
3 
4 #include <math.h>
5 #include <float.h>
6 #include <string.h>
7 
8 /* Simple layout structure */
9 
fz_new_layout(fz_context * ctx)10 fz_layout_block *fz_new_layout(fz_context *ctx)
11 {
12 	fz_pool *pool = fz_new_pool(ctx);
13 	fz_layout_block *block;
14 	fz_try(ctx)
15 	{
16 		block = fz_pool_alloc(ctx, pool, sizeof (fz_layout_block));
17 		block->pool = pool;
18 		block->head = NULL;
19 		block->tailp = &block->head;
20 	}
21 	fz_catch(ctx)
22 	{
23 		fz_drop_pool(ctx, pool);
24 		fz_rethrow(ctx);
25 	}
26 	return block;
27 }
28 
/*
	Release a layout block by dropping the pool it was allocated from.
	Passing NULL is allowed and does nothing.
*/
void fz_drop_layout(fz_context *ctx, fz_layout_block *block)
{
	if (block == NULL)
		return;
	fz_drop_pool(ctx, block->pool);
}
34 
/*
	Append a new, empty line to the end of a layout block.

	x, y, h: stored unchanged in the line record.
	p: stored as given (the pointer is kept, the text is not copied).

	Characters added afterwards with fz_add_layout_char are attached
	to this line.
*/
void fz_add_layout_line(fz_context *ctx, fz_layout_block *block, float x, float y, float h, const char *p)
{
	fz_layout_line *entry = fz_pool_alloc(ctx, block->pool, sizeof *entry);

	entry->p = p;
	entry->x = x;
	entry->y = y;
	entry->h = h;
	entry->next = NULL;
	entry->text = NULL;

	/* Hook the line onto the tail of the list and advance the tail link. */
	*block->tailp = entry;
	block->tailp = &entry->next;

	/* Subsequent characters go into this line's (currently empty) text list. */
	block->text_tailp = &entry->text;
}
48 
/*
	Append a character record to the line most recently added to the block.

	x, w: stored unchanged in the character record.
	p: stored as given (pointer kept, not copied).

	NOTE(review): in the code visible here block->text_tailp is only
	assigned by fz_add_layout_line, so a line is expected to have been
	added before the first character - confirm against callers.
*/
void fz_add_layout_char(fz_context *ctx, fz_layout_block *block, float x, float w, const char *p)
{
	fz_layout_char *entry = fz_pool_alloc(ctx, block->pool, sizeof *entry);

	entry->p = p;
	entry->x = x;
	entry->w = w;
	entry->next = NULL;

	/* Attach at the tail of the current line's character list. */
	*block->text_tailp = entry;
	block->text_tailp = &entry->next;
}
59 
60 /* Extract text into blocks and lines. */
61 
/*
	Thresholds for the line and paragraph heuristics in
	fz_add_stext_char_imp. Each one is multiplied by the glyph size
	before being compared against a distance.
*/

/* Largest offset from the previous baseline that still gives a new line in the same block. */
#define PARAGRAPH_DIST (1.5f)
/* In-line gaps smaller than this are ignored. */
#define SPACE_DIST (0.15f)
/* In-line gaps larger than this split the text onto a new line. */
#define SPACE_MAX_DIST (0.8f)
65 
66 typedef struct
67 {
68 	fz_device super;
69 	fz_stext_page *page;
70 	fz_point pen, start;
71 	fz_matrix trm;
72 	int new_obj;
73 	int curdir;
74 	int lastchar;
75 	int flags;
76 	int color;
77 	const fz_text *lasttext;
78 } fz_stext_device;
79 
/*
	Help text describing the option names understood by
	fz_parse_stext_options, one tab-indented option per line.
*/
const char *fz_stext_options_usage =
	"Text output options:\n"
	"\tinhibit-spaces: don't add spaces between gaps in the text\n"
	"\tpreserve-images: keep images in output\n"
	"\tpreserve-ligatures: do not expand ligatures into constituent characters\n"
	"\tpreserve-whitespace: do not convert all whitespace into space characters\n"
	"\tpreserve-spans: do not merge spans on the same line\n"
	"\tdehyphenate: attempt to join up hyphenated words\n"
	"\n";
89 
90 fz_stext_page *
fz_new_stext_page(fz_context * ctx,fz_rect mediabox)91 fz_new_stext_page(fz_context *ctx, fz_rect mediabox)
92 {
93 	fz_pool *pool = fz_new_pool(ctx);
94 	fz_stext_page *page = NULL;
95 	fz_try(ctx)
96 	{
97 		page = fz_pool_alloc(ctx, pool, sizeof(*page));
98 		page->pool = pool;
99 		page->mediabox = mediabox;
100 		page->first_block = NULL;
101 		page->last_block = NULL;
102 	}
103 	fz_catch(ctx)
104 	{
105 		fz_drop_pool(ctx, pool);
106 		fz_rethrow(ctx);
107 	}
108 	return page;
109 }
110 
111 void
fz_drop_stext_page(fz_context * ctx,fz_stext_page * page)112 fz_drop_stext_page(fz_context *ctx, fz_stext_page *page)
113 {
114 	if (page)
115 	{
116 		fz_stext_block *block;
117 		for (block = page->first_block; block; block = block->next)
118 			if (block->type == FZ_STEXT_BLOCK_IMAGE)
119 				fz_drop_image(ctx, block->u.i.image);
120 		fz_drop_pool(ctx, page->pool);
121 	}
122 }
123 
124 static fz_stext_block *
add_block_to_page(fz_context * ctx,fz_stext_page * page)125 add_block_to_page(fz_context *ctx, fz_stext_page *page)
126 {
127 	fz_stext_block *block = fz_pool_alloc(ctx, page->pool, sizeof *page->first_block);
128 	block->prev = page->last_block;
129 	if (!page->first_block)
130 		page->first_block = page->last_block = block;
131 	else
132 	{
133 		page->last_block->next = block;
134 		page->last_block = block;
135 	}
136 	return block;
137 }
138 
139 static fz_stext_block *
add_text_block_to_page(fz_context * ctx,fz_stext_page * page)140 add_text_block_to_page(fz_context *ctx, fz_stext_page *page)
141 {
142 	fz_stext_block *block = add_block_to_page(ctx, page);
143 	block->type = FZ_STEXT_BLOCK_TEXT;
144 	return block;
145 }
146 
147 static fz_stext_block *
add_image_block_to_page(fz_context * ctx,fz_stext_page * page,fz_matrix ctm,fz_image * image)148 add_image_block_to_page(fz_context *ctx, fz_stext_page *page, fz_matrix ctm, fz_image *image)
149 {
150 	fz_stext_block *block = add_block_to_page(ctx, page);
151 	block->type = FZ_STEXT_BLOCK_IMAGE;
152 	block->u.i.transform = ctm;
153 	block->u.i.image = fz_keep_image(ctx, image);
154 	block->bbox = fz_transform_rect(fz_unit_rect, ctm);
155 	return block;
156 }
157 
158 static fz_stext_line *
add_line_to_block(fz_context * ctx,fz_stext_page * page,fz_stext_block * block,const fz_point * dir,int wmode)159 add_line_to_block(fz_context *ctx, fz_stext_page *page, fz_stext_block *block, const fz_point *dir, int wmode)
160 {
161 	fz_stext_line *line = fz_pool_alloc(ctx, page->pool, sizeof *block->u.t.first_line);
162 	line->prev = block->u.t.last_line;
163 	if (!block->u.t.first_line)
164 		block->u.t.first_line = block->u.t.last_line = line;
165 	else
166 	{
167 		block->u.t.last_line->next = line;
168 		block->u.t.last_line = line;
169 	}
170 
171 	line->dir = *dir;
172 	line->wmode = wmode;
173 
174 	return line;
175 }
176 
177 static fz_stext_char *
add_char_to_line(fz_context * ctx,fz_stext_page * page,fz_stext_line * line,fz_matrix trm,fz_font * font,float size,int c,fz_point * p,fz_point * q,int color)178 add_char_to_line(fz_context *ctx, fz_stext_page *page, fz_stext_line *line, fz_matrix trm, fz_font *font, float size, int c, fz_point *p, fz_point *q, int color)
179 {
180 	fz_stext_char *ch = fz_pool_alloc(ctx, page->pool, sizeof *line->first_char);
181 	fz_point a, d;
182 
183 	if (!line->first_char)
184 		line->first_char = line->last_char = ch;
185 	else
186 	{
187 		line->last_char->next = ch;
188 		line->last_char = ch;
189 	}
190 
191 	ch->c = c;
192 	ch->color = color;
193 	ch->origin = *p;
194 	ch->size = size;
195 	ch->font = font; /* TODO: keep and drop */
196 
197 	if (line->wmode == 0)
198 	{
199 		a.x = 0;
200 		d.x = 0;
201 		a.y = fz_font_ascender(ctx, font);
202 		d.y = fz_font_descender(ctx, font);
203 	}
204 	else
205 	{
206 		fz_rect bbox = fz_font_bbox(ctx, font);
207 		a.x = bbox.x1;
208 		d.x = bbox.x0;
209 		a.y = 0;
210 		d.y = 0;
211 	}
212 	a = fz_transform_vector(a, trm);
213 	d = fz_transform_vector(d, trm);
214 
215 	ch->quad.ll = fz_make_point(p->x + d.x, p->y + d.y);
216 	ch->quad.ul = fz_make_point(p->x + a.x, p->y + a.y);
217 	ch->quad.lr = fz_make_point(q->x + d.x, q->y + d.y);
218 	ch->quad.ur = fz_make_point(q->x + a.x, q->y + a.y);
219 
220 	return ch;
221 }
222 
223 static void
remove_last_char(fz_context * ctx,fz_stext_line * line)224 remove_last_char(fz_context *ctx, fz_stext_line *line)
225 {
226 	if (line && line->first_char)
227 	{
228 		fz_stext_char *prev = NULL;
229 		fz_stext_char *ch = line->first_char;
230 		while (ch->next)
231 		{
232 			prev = ch;
233 			ch = ch->next;
234 		}
235 		if (prev)
236 		{
237 			/* the characters are pool allocated, so we don't actually leak the removed node */
238 			line->last_char = prev;
239 			line->last_char->next = NULL;
240 		}
241 	}
242 }
243 
244 static int
direction_from_bidi_class(int bidiclass,int curdir)245 direction_from_bidi_class(int bidiclass, int curdir)
246 {
247 	switch (bidiclass)
248 	{
249 	/* strong */
250 	case UCDN_BIDI_CLASS_L: return 1;
251 	case UCDN_BIDI_CLASS_R: return -1;
252 	case UCDN_BIDI_CLASS_AL: return -1;
253 
254 	/* weak */
255 	case UCDN_BIDI_CLASS_EN:
256 	case UCDN_BIDI_CLASS_ES:
257 	case UCDN_BIDI_CLASS_ET:
258 	case UCDN_BIDI_CLASS_AN:
259 	case UCDN_BIDI_CLASS_CS:
260 	case UCDN_BIDI_CLASS_NSM:
261 	case UCDN_BIDI_CLASS_BN:
262 		return curdir;
263 
264 	/* neutral */
265 	case UCDN_BIDI_CLASS_B:
266 	case UCDN_BIDI_CLASS_S:
267 	case UCDN_BIDI_CLASS_WS:
268 	case UCDN_BIDI_CLASS_ON:
269 		return curdir;
270 
271 	/* embedding, override, pop ... we don't support them */
272 	default:
273 		return 0;
274 	}
275 }
276 
/* Return 1 if c is one of the hyphen characters handled by dehyphenation, 0 otherwise. */
static int is_hyphen(int c)
{
	switch (c)
	{
	case '-':	/* hyphen-minus */
	case 0xAD:	/* soft hyphen */
	case 0x2010:	/* hyphen */
	case 0x2011:	/* non-breaking hyphen */
		return 1;
	default:
		return 0;
	}
}
282 
283 static float
vec_dot(const fz_point * a,const fz_point * b)284 vec_dot(const fz_point *a, const fz_point *b)
285 {
286 	return a->x * b->x + a->y * b->y;
287 }
288 
289 static void
fz_add_stext_char_imp(fz_context * ctx,fz_stext_device * dev,fz_font * font,int c,int glyph,fz_matrix trm,float adv,int wmode,int force_new_line)290 fz_add_stext_char_imp(fz_context *ctx, fz_stext_device *dev, fz_font *font, int c, int glyph, fz_matrix trm, float adv, int wmode, int force_new_line)
291 {
292 	fz_stext_page *page = dev->page;
293 	fz_stext_block *cur_block;
294 	fz_stext_line *cur_line;
295 
296 	int new_para = 0;
297 	int new_line = 1;
298 	int add_space = 0;
299 	fz_point dir, ndir, p, q;
300 	float size;
301 	fz_point delta;
302 	float spacing = 0;
303 	float base_offset = 0;
304 	int rtl = 0;
305 
306 	dev->curdir = direction_from_bidi_class(ucdn_get_bidi_class(c), dev->curdir);
307 
308 	/* dir = direction vector for motion. ndir = normalised(dir) */
309 	if (wmode == 0)
310 	{
311 		dir.x = 1;
312 		dir.y = 0;
313 	}
314 	else
315 	{
316 		dir.x = 0;
317 		dir.y = -1;
318 	}
319 	dir = fz_transform_vector(dir, trm);
320 	ndir = fz_normalize_vector(dir);
321 
322 	size = fz_matrix_expansion(trm);
323 
324 	/* We need to identify where glyphs 'start' (p) and 'stop' (q).
325 	 * Each glyph holds its 'start' position, and the next glyph in the
326 	 * span (or span->max if there is no next glyph) holds its 'end'
327 	 * position.
328 	 *
329 	 * For both horizontal and vertical motion, trm->{e,f} gives the
330 	 * origin (usually the bottom left) of the glyph.
331 	 *
332 	 * In horizontal mode:
333 	 *   + p is bottom left.
334 	 *   + q is the bottom right
335 	 * In vertical mode:
336 	 *   + p is top left (where it advanced from)
337 	 *   + q is bottom left
338 	 */
339 	if (wmode == 0)
340 	{
341 		p.x = trm.e;
342 		p.y = trm.f;
343 		q.x = trm.e + adv * dir.x;
344 		q.y = trm.f + adv * dir.y;
345 	}
346 	else
347 	{
348 		p.x = trm.e - adv * dir.x;
349 		p.y = trm.f - adv * dir.y;
350 		q.x = trm.e;
351 		q.y = trm.f;
352 	}
353 
354 	/* Find current position to enter new text. */
355 	cur_block = page->last_block;
356 	if (cur_block && cur_block->type != FZ_STEXT_BLOCK_TEXT)
357 		cur_block = NULL;
358 	cur_line = cur_block ? cur_block->u.t.last_line : NULL;
359 
360 	if (cur_line && glyph < 0)
361 	{
362 		/* Don't advance pen or break lines for no-glyph characters in a cluster */
363 		add_char_to_line(ctx, page, cur_line, trm, font, size, c, &dev->pen, &dev->pen, dev->color);
364 		dev->lastchar = c;
365 		return;
366 	}
367 
368 	if (cur_line == NULL || cur_line->wmode != wmode || vec_dot(&ndir, &cur_line->dir) < 0.999f)
369 	{
370 		/* If the matrix has changed rotation, or the wmode is different (or if we don't have a line at all),
371 		 * then we can't append to the current block/line. */
372 		new_para = 1;
373 		new_line = 1;
374 	}
375 	else
376 	{
377 		/* Detect fake bold where text is printed twice in the same place. */
378 		delta.x = fabsf(q.x - dev->pen.x);
379 		delta.y = fabsf(q.y - dev->pen.y);
380 		if (delta.x < FLT_EPSILON && delta.y < FLT_EPSILON && c == dev->lastchar)
381 			return;
382 
383 		/* Calculate how far we've moved since the last character. */
384 		delta.x = p.x - dev->pen.x;
385 		delta.y = p.y - dev->pen.y;
386 
387 		/* The transform has not changed, so we know we're in the same
388 		 * direction. Calculate 2 distances; how far off the previous
389 		 * baseline we are, together with how far along the baseline
390 		 * we are from the expected position. */
391 		spacing = ndir.x * delta.x + ndir.y * delta.y;
392 		base_offset = -ndir.y * delta.x + ndir.x * delta.y;
393 
394 		/* Only a small amount off the baseline - we'll take this */
395 		if (fabsf(base_offset) < size * 0.8f)
396 		{
397 			/* LTR or neutral character */
398 			if (dev->curdir >= 0)
399 			{
400 				if (fabsf(spacing) < size * SPACE_DIST)
401 				{
402 					/* Motion is in line and small enough to ignore. */
403 					new_line = 0;
404 				}
405 				else if (fabsf(spacing) > size * SPACE_MAX_DIST)
406 				{
407 					/* Motion is in line and large enough to warrant splitting to a new line */
408 					new_line = 1;
409 				}
410 				else if (spacing < 0)
411 				{
412 					/* Motion is backward in line! Ignore this odd spacing. */
413 					new_line = 0;
414 				}
415 				else
416 				{
417 					/* Motion is forward in line and large enough to warrant us adding a space. */
418 					if (dev->lastchar != ' ' && wmode == 0)
419 						add_space = 1;
420 					new_line = 0;
421 				}
422 			}
423 
424 			/* RTL character -- disable space character and column detection heuristics */
425 			else
426 			{
427 				new_line = 0;
428 				if (spacing > size * SPACE_DIST || spacing < 0)
429 					rtl = 0; /* backward (or big jump to 'right' side) means logical order */
430 				else
431 					rtl = 1; /* visual order, we need to reverse in a post process pass */
432 			}
433 		}
434 
435 		/* Enough for a new line, but not enough for a new paragraph */
436 		else if (fabsf(base_offset) <= size * PARAGRAPH_DIST)
437 		{
438 			/* Check indent to spot text-indent style paragraphs */
439 			if (wmode == 0 && cur_line && dev->new_obj)
440 				if (fabsf(p.x - dev->start.x) > size * 0.5f)
441 					new_para = 1;
442 			new_line = 1;
443 		}
444 
445 		/* Way off the baseline - open a new paragraph */
446 		else
447 		{
448 			new_para = 1;
449 			new_line = 1;
450 		}
451 	}
452 
453 	/* Start a new block (but only at the beginning of a text object) */
454 	if (new_para || !cur_block)
455 	{
456 		cur_block = add_text_block_to_page(ctx, page);
457 		cur_line = cur_block->u.t.last_line;
458 	}
459 
460 	if (new_line && (dev->flags & FZ_STEXT_DEHYPHENATE) && is_hyphen(dev->lastchar))
461 	{
462 		remove_last_char(ctx, cur_line);
463 		new_line = 0;
464 	}
465 
466 	/* Start a new line */
467 	if (new_line || !cur_line || force_new_line)
468 	{
469 		cur_line = add_line_to_block(ctx, page, cur_block, &ndir, wmode);
470 		dev->start = p;
471 	}
472 
473 	/* Add synthetic space */
474 	if (add_space && !(dev->flags & FZ_STEXT_INHIBIT_SPACES))
475 		add_char_to_line(ctx, page, cur_line, trm, font, size, ' ', &dev->pen, &p, dev->color);
476 
477 	add_char_to_line(ctx, page, cur_line, trm, font, size, c, &p, &q, dev->color);
478 	dev->lastchar = c;
479 	dev->pen = q;
480 
481 	dev->new_obj = 0;
482 	dev->trm = trm;
483 }
484 
485 static void
fz_add_stext_char(fz_context * ctx,fz_stext_device * dev,fz_font * font,int c,int glyph,fz_matrix trm,float adv,int wmode,int force_new_line)486 fz_add_stext_char(fz_context *ctx, fz_stext_device *dev, fz_font *font, int c, int glyph, fz_matrix trm, float adv, int wmode, int force_new_line)
487 {
488 	/* ignore when one unicode character maps to multiple glyphs */
489 	if (c == -1)
490 		return;
491 
492 	if (!(dev->flags & FZ_STEXT_PRESERVE_LIGATURES))
493 	{
494 		switch (c)
495 		{
496 		case 0xFB00: /* ff */
497 			fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode, force_new_line);
498 			fz_add_stext_char_imp(ctx, dev, font, 'f', -1, trm, 0, wmode, 0);
499 			return;
500 		case 0xFB01: /* fi */
501 			fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode, force_new_line);
502 			fz_add_stext_char_imp(ctx, dev, font, 'i', -1, trm, 0, wmode, 0);
503 			return;
504 		case 0xFB02: /* fl */
505 			fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode, force_new_line);
506 			fz_add_stext_char_imp(ctx, dev, font, 'l', -1, trm, 0, wmode, 0);
507 			return;
508 		case 0xFB03: /* ffi */
509 			fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode, force_new_line);
510 			fz_add_stext_char_imp(ctx, dev, font, 'f', -1, trm, 0, wmode, 0);
511 			fz_add_stext_char_imp(ctx, dev, font, 'i', -1, trm, 0, wmode, 0);
512 			return;
513 		case 0xFB04: /* ffl */
514 			fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode, force_new_line);
515 			fz_add_stext_char_imp(ctx, dev, font, 'f', -1, trm, 0, wmode, 0);
516 			fz_add_stext_char_imp(ctx, dev, font, 'l', -1, trm, 0, wmode, 0);
517 			return;
518 		case 0xFB05: /* long st */
519 		case 0xFB06: /* st */
520 			fz_add_stext_char_imp(ctx, dev, font, 's', glyph, trm, adv, wmode, force_new_line);
521 			fz_add_stext_char_imp(ctx, dev, font, 't', -1, trm, 0, wmode, 0);
522 			return;
523 		}
524 	}
525 
526 	if (!(dev->flags & FZ_STEXT_PRESERVE_WHITESPACE))
527 	{
528 		switch (c)
529 		{
530 		case 0x0009: /* tab */
531 		case 0x0020: /* space */
532 		case 0x00A0: /* no-break space */
533 		case 0x1680: /* ogham space mark */
534 		case 0x180E: /* mongolian vowel separator */
535 		case 0x2000: /* en quad */
536 		case 0x2001: /* em quad */
537 		case 0x2002: /* en space */
538 		case 0x2003: /* em space */
539 		case 0x2004: /* three-per-em space */
540 		case 0x2005: /* four-per-em space */
541 		case 0x2006: /* six-per-em space */
542 		case 0x2007: /* figure space */
543 		case 0x2008: /* punctuation space */
544 		case 0x2009: /* thin space */
545 		case 0x200A: /* hair space */
546 		case 0x202F: /* narrow no-break space */
547 		case 0x205F: /* medium mathematical space */
548 		case 0x3000: /* ideographic space */
549 			c = ' ';
550 		}
551 	}
552 
553 	fz_add_stext_char_imp(ctx, dev, font, c, glyph, trm, adv, wmode, force_new_line);
554 }
555 
556 static void
fz_stext_extract(fz_context * ctx,fz_stext_device * dev,fz_text_span * span,fz_matrix ctm)557 fz_stext_extract(fz_context *ctx, fz_stext_device *dev, fz_text_span *span, fz_matrix ctm)
558 {
559 	fz_font *font = span->font;
560 	fz_matrix tm = span->trm;
561 	fz_matrix trm;
562 	float adv;
563 	int i;
564 
565 	if (span->len == 0)
566 		return;
567 
568 	for (i = 0; i < span->len; i++)
569 	{
570 		/* Calculate new pen location and delta */
571 		tm.e = span->items[i].x;
572 		tm.f = span->items[i].y;
573 		trm = fz_concat(tm, ctm);
574 
575 		/* Calculate bounding box and new pen position based on font metrics */
576 		if (span->items[i].gid >= 0)
577 			adv = fz_advance_glyph(ctx, font, span->items[i].gid, span->wmode);
578 		else
579 			adv = 0;
580 
581 		fz_add_stext_char(ctx, dev, font,
582 			span->items[i].ucs,
583 			span->items[i].gid,
584 			trm,
585 			adv,
586 			span->wmode,
587 			(i == 0) && (dev->flags & FZ_STEXT_PRESERVE_SPANS));
588 	}
589 }
590 
/*
	Convert a colour to device RGB and pack it as 0xRRGGBB, with each
	component scaled by 255 and clamped to the range 0..255.
*/
static int hexrgb_from_color(fz_context *ctx, fz_colorspace *colorspace, const float *color)
{
	float rgb[3];
	int red, green, blue;

	fz_convert_color(ctx, colorspace, color, fz_device_rgb(ctx), rgb, NULL, fz_default_color_params);

	red = fz_clampi(rgb[0] * 255, 0, 255);
	green = fz_clampi(rgb[1] * 255, 0, 255);
	blue = fz_clampi(rgb[2] * 255, 0, 255);

	return (red << 16) | (green << 8) | blue;
}
600 
601 static void
fz_stext_fill_text(fz_context * ctx,fz_device * dev,const fz_text * text,fz_matrix ctm,fz_colorspace * colorspace,const float * color,float alpha,fz_color_params color_params)602 fz_stext_fill_text(fz_context *ctx, fz_device *dev, const fz_text *text, fz_matrix ctm,
603 	fz_colorspace *colorspace, const float *color, float alpha, fz_color_params color_params)
604 {
605 	fz_stext_device *tdev = (fz_stext_device*)dev;
606 	fz_text_span *span;
607 	if (text == tdev->lasttext)
608 		return;
609 	tdev->color = hexrgb_from_color(ctx, colorspace, color);
610 	tdev->new_obj = 1;
611 	for (span = text->head; span; span = span->next)
612 		fz_stext_extract(ctx, tdev, span, ctm);
613 	fz_drop_text(ctx, tdev->lasttext);
614 	tdev->lasttext = fz_keep_text(ctx, text);
615 }
616 
617 static void
fz_stext_stroke_text(fz_context * ctx,fz_device * dev,const fz_text * text,const fz_stroke_state * stroke,fz_matrix ctm,fz_colorspace * colorspace,const float * color,float alpha,fz_color_params color_params)618 fz_stext_stroke_text(fz_context *ctx, fz_device *dev, const fz_text *text, const fz_stroke_state *stroke, fz_matrix ctm,
619 	fz_colorspace *colorspace, const float *color, float alpha, fz_color_params color_params)
620 {
621 	fz_stext_device *tdev = (fz_stext_device*)dev;
622 	fz_text_span *span;
623 	if (text == tdev->lasttext)
624 		return;
625 	tdev->color = hexrgb_from_color(ctx, colorspace, color);
626 	tdev->new_obj = 1;
627 	for (span = text->head; span; span = span->next)
628 		fz_stext_extract(ctx, tdev, span, ctm);
629 	fz_drop_text(ctx, tdev->lasttext);
630 	tdev->lasttext = fz_keep_text(ctx, text);
631 }
632 
633 static void
fz_stext_clip_text(fz_context * ctx,fz_device * dev,const fz_text * text,fz_matrix ctm,fz_rect scissor)634 fz_stext_clip_text(fz_context *ctx, fz_device *dev, const fz_text *text, fz_matrix ctm, fz_rect scissor)
635 {
636 	fz_stext_device *tdev = (fz_stext_device*)dev;
637 	fz_text_span *span;
638 	if (text == tdev->lasttext)
639 		return;
640 	tdev->color = 0;
641 	tdev->new_obj = 1;
642 	for (span = text->head; span; span = span->next)
643 		fz_stext_extract(ctx, tdev, span, ctm);
644 	fz_drop_text(ctx, tdev->lasttext);
645 	tdev->lasttext = fz_keep_text(ctx, text);
646 }
647 
648 static void
fz_stext_clip_stroke_text(fz_context * ctx,fz_device * dev,const fz_text * text,const fz_stroke_state * stroke,fz_matrix ctm,fz_rect scissor)649 fz_stext_clip_stroke_text(fz_context *ctx, fz_device *dev, const fz_text *text, const fz_stroke_state *stroke, fz_matrix ctm, fz_rect scissor)
650 {
651 	fz_stext_device *tdev = (fz_stext_device*)dev;
652 	fz_text_span *span;
653 	if (text == tdev->lasttext)
654 		return;
655 	tdev->color = 0;
656 	tdev->new_obj = 1;
657 	for (span = text->head; span; span = span->next)
658 		fz_stext_extract(ctx, tdev, span, ctm);
659 	fz_drop_text(ctx, tdev->lasttext);
660 	tdev->lasttext = fz_keep_text(ctx, text);
661 }
662 
663 static void
fz_stext_ignore_text(fz_context * ctx,fz_device * dev,const fz_text * text,fz_matrix ctm)664 fz_stext_ignore_text(fz_context *ctx, fz_device *dev, const fz_text *text, fz_matrix ctm)
665 {
666 	fz_stext_device *tdev = (fz_stext_device*)dev;
667 	fz_text_span *span;
668 	if (text == tdev->lasttext)
669 		return;
670 	tdev->color = 0;
671 	tdev->new_obj = 1;
672 	for (span = text->head; span; span = span->next)
673 		fz_stext_extract(ctx, tdev, span, ctm);
674 	fz_drop_text(ctx, tdev->lasttext);
675 	tdev->lasttext = fz_keep_text(ctx, text);
676 }
677 
678 /* Images and shadings */
679 
680 static void
fz_stext_fill_image(fz_context * ctx,fz_device * dev,fz_image * img,fz_matrix ctm,float alpha,fz_color_params color_params)681 fz_stext_fill_image(fz_context *ctx, fz_device *dev, fz_image *img, fz_matrix ctm, float alpha, fz_color_params color_params)
682 {
683 	fz_stext_device *tdev = (fz_stext_device*)dev;
684 
685 	/* If the alpha is less than 50% then it's probably a watermark or effect or something. Skip it. */
686 	if (alpha < 0.5f)
687 		return;
688 
689 	add_image_block_to_page(ctx, tdev->page, ctm, img);
690 }
691 
692 static void
fz_stext_fill_image_mask(fz_context * ctx,fz_device * dev,fz_image * img,fz_matrix ctm,fz_colorspace * cspace,const float * color,float alpha,fz_color_params color_params)693 fz_stext_fill_image_mask(fz_context *ctx, fz_device *dev, fz_image *img, fz_matrix ctm,
694 		fz_colorspace *cspace, const float *color, float alpha, fz_color_params color_params)
695 {
696 	fz_stext_fill_image(ctx, dev, img, ctm, alpha, color_params);
697 }
698 
699 static fz_image *
fz_new_image_from_shade(fz_context * ctx,fz_shade * shade,fz_matrix * in_out_ctm,fz_color_params color_params,fz_rect scissor)700 fz_new_image_from_shade(fz_context *ctx, fz_shade *shade, fz_matrix *in_out_ctm, fz_color_params color_params, fz_rect scissor)
701 {
702 	fz_matrix ctm = *in_out_ctm;
703 	fz_pixmap *pix;
704 	fz_image *img = NULL;
705 	fz_rect bounds;
706 	fz_irect bbox;
707 
708 	bounds = fz_bound_shade(ctx, shade, ctm);
709 	bounds = fz_intersect_rect(bounds, scissor);
710 	bbox = fz_irect_from_rect(bounds);
711 
712 	pix = fz_new_pixmap_with_bbox(ctx, fz_device_rgb(ctx), bbox, NULL, !shade->use_background);
713 	fz_try(ctx)
714 	{
715 		if (shade->use_background)
716 			fz_fill_pixmap_with_color(ctx, pix, shade->colorspace, shade->background, color_params);
717 		else
718 			fz_clear_pixmap(ctx, pix);
719 		fz_paint_shade(ctx, shade, NULL, ctm, pix, color_params, bbox, NULL);
720 		img = fz_new_image_from_pixmap(ctx, pix, NULL);
721 	}
722 	fz_always(ctx)
723 		fz_drop_pixmap(ctx, pix);
724 	fz_catch(ctx)
725 		fz_rethrow(ctx);
726 
727 	in_out_ctm->a = pix->w;
728 	in_out_ctm->b = 0;
729 	in_out_ctm->c = 0;
730 	in_out_ctm->d = pix->h;
731 	in_out_ctm->e = pix->x;
732 	in_out_ctm->f = pix->y;
733 	return img;
734 }
735 
736 static void
fz_stext_fill_shade(fz_context * ctx,fz_device * dev,fz_shade * shade,fz_matrix ctm,float alpha,fz_color_params color_params)737 fz_stext_fill_shade(fz_context *ctx, fz_device *dev, fz_shade *shade, fz_matrix ctm, float alpha, fz_color_params color_params)
738 {
739 	fz_matrix local_ctm = ctm;
740 	fz_rect scissor = fz_device_current_scissor(ctx, dev);
741 	fz_image *image = fz_new_image_from_shade(ctx, shade, &local_ctm, color_params, scissor);
742 	fz_try(ctx)
743 		fz_stext_fill_image(ctx, dev, image, local_ctm, alpha, color_params);
744 	fz_always(ctx)
745 		fz_drop_image(ctx, image);
746 	fz_catch(ctx)
747 		fz_rethrow(ctx);
748 }
749 
750 static void
fz_stext_close_device(fz_context * ctx,fz_device * dev)751 fz_stext_close_device(fz_context *ctx, fz_device *dev)
752 {
753 	fz_stext_device *tdev = (fz_stext_device*)dev;
754 	fz_stext_page *page = tdev->page;
755 	fz_stext_block *block;
756 	fz_stext_line *line;
757 	fz_stext_char *ch;
758 
759 	for (block = page->first_block; block; block = block->next)
760 	{
761 		if (block->type != FZ_STEXT_BLOCK_TEXT)
762 			continue;
763 		for (line = block->u.t.first_line; line; line = line->next)
764 		{
765 			for (ch = line->first_char; ch; ch = ch->next)
766 			{
767 				fz_rect ch_box = fz_rect_from_quad(ch->quad);
768 				if (ch == line->first_char)
769 					line->bbox = ch_box;
770 				else
771 					line->bbox = fz_union_rect(line->bbox, ch_box);
772 			}
773 			block->bbox = fz_union_rect(block->bbox, line->bbox);
774 		}
775 	}
776 
777 	/* TODO: smart sorting of blocks and lines in reading order */
778 	/* TODO: unicode NFC normalization */
779 }
780 
781 static void
fz_stext_drop_device(fz_context * ctx,fz_device * dev)782 fz_stext_drop_device(fz_context *ctx, fz_device *dev)
783 {
784 	fz_stext_device *tdev = (fz_stext_device*)dev;
785 	fz_drop_text(ctx, tdev->lasttext);
786 }
787 
788 fz_stext_options *
fz_parse_stext_options(fz_context * ctx,fz_stext_options * opts,const char * string)789 fz_parse_stext_options(fz_context *ctx, fz_stext_options *opts, const char *string)
790 {
791 	const char *val;
792 
793 	memset(opts, 0, sizeof *opts);
794 
795 	if (fz_has_option(ctx, string, "preserve-ligatures", &val) && fz_option_eq(val, "yes"))
796 		opts->flags |= FZ_STEXT_PRESERVE_LIGATURES;
797 	if (fz_has_option(ctx, string, "preserve-whitespace", &val) && fz_option_eq(val, "yes"))
798 		opts->flags |= FZ_STEXT_PRESERVE_WHITESPACE;
799 	if (fz_has_option(ctx, string, "preserve-images", &val) && fz_option_eq(val, "yes"))
800 		opts->flags |= FZ_STEXT_PRESERVE_IMAGES;
801 	if (fz_has_option(ctx, string, "inhibit-spaces", &val) && fz_option_eq(val, "yes"))
802 		opts->flags |= FZ_STEXT_INHIBIT_SPACES;
803 	if (fz_has_option(ctx, string, "dehyphenate", &val) && fz_option_eq(val, "yes"))
804 		opts->flags |= FZ_STEXT_DEHYPHENATE;
805 	if (fz_has_option(ctx, string, "preserve-spans", &val) && fz_option_eq(val, "yes"))
806 		opts->flags |= FZ_STEXT_PRESERVE_SPANS;
807 
808 	return opts;
809 }
810 
811 fz_device *
fz_new_stext_device(fz_context * ctx,fz_stext_page * page,const fz_stext_options * opts)812 fz_new_stext_device(fz_context *ctx, fz_stext_page *page, const fz_stext_options *opts)
813 {
814 	fz_stext_device *dev = fz_new_derived_device(ctx, fz_stext_device);
815 
816 	dev->super.close_device = fz_stext_close_device;
817 	dev->super.drop_device = fz_stext_drop_device;
818 
819 	dev->super.fill_text = fz_stext_fill_text;
820 	dev->super.stroke_text = fz_stext_stroke_text;
821 	dev->super.clip_text = fz_stext_clip_text;
822 	dev->super.clip_stroke_text = fz_stext_clip_stroke_text;
823 	dev->super.ignore_text = fz_stext_ignore_text;
824 
825 	if (opts && (opts->flags & FZ_STEXT_PRESERVE_IMAGES))
826 	{
827 		dev->super.fill_shade = fz_stext_fill_shade;
828 		dev->super.fill_image = fz_stext_fill_image;
829 		dev->super.fill_image_mask = fz_stext_fill_image_mask;
830 	}
831 
832 	if (opts)
833 		dev->flags = opts->flags;
834 	dev->page = page;
835 	dev->pen.x = 0;
836 	dev->pen.y = 0;
837 	dev->trm = fz_identity;
838 	dev->lastchar = ' ';
839 	dev->curdir = 1;
840 	dev->lasttext = NULL;
841 
842 	return (fz_device*)dev;
843 }
844