1 #include "mupdf/fitz.h"
2 
3 #include <string.h>
4 #include <limits.h>
5 
6 #if !defined(HAVE_LEPTONICA) || !defined(HAVE_TESSERACT)
7 #ifndef OCR_DISABLED
8 #define OCR_DISABLED
9 #endif
10 #endif
11 
12 #ifdef OCR_DISABLED
13 
14 /* In non-OCR builds, we need to define this otherwise SWIG Python gets SEGV
15 when it attempts to import mupdf.py and _mupdf.py. */
16 const char *fz_pdfocr_write_options_usage = "";
17 
18 #else
19 
20 #include "tessocr.h"
21 
22 const char *fz_pdfocr_write_options_usage =
23 	"PDFOCR output options:\n"
24 	"\tcompression=none: No compression (default)\n"
25 	"\tcompression=flate: Flate compression\n"
26 	"\tstrip-height=N: Strip height (default 0=fullpage)\n"
27 	"\tocr-language=<lang>: OCR language (default=eng)\n"
28 	"\n";
29 
/* PDF object 3: the Type0 /GlyphLessFont font dictionary. It points at
 * object 4 as its descendant font and object 6 as its ToUnicode stream. */
30 static const char funky_font[] =
31 "3 0 obj\n<</BaseFont/GlyphLessFont/DescendantFonts[4 0 R]"
32 "/Encoding/Identity-H/Subtype/Type0/ToUnicode 6 0 R/Type/Font"
33 ">>\nendobj\n";
34 
/* PDF object 4: the CIDFontType2 descendant font. It refers to object 5
 * (CIDToGIDMap) and object 7 (FontDescriptor), with a default width of 500. */
35 static const char funky_font2[] =
36 "4 0 obj\n"
37 "<</BaseFont/GlyphLessFont/CIDToGIDMap 5 0 R"
38 "/CIDSystemInfo<</Ordering (Identity)/Registry (Adobe)/Supplement 0>>"
39 "/FontDescriptor 7 0 R/Subtype/CIDFontType2/Type/Font/DW 500>>"
40 "\nendobj\n";
41 
/* PDF object 5: the stream used as the CIDToGIDMap by object 4. The
 * payload is pre-compressed (/Filter/FlateDecode) binary data, so it is
 * written with an explicit length (sizeof-1) rather than as a C string. */
42 static const char funky_font3[] =
43 "5 0 obj\n<</Length 210/Filter/FlateDecode>>\nstream\n"
44 "\x78\x9c\xec\xc2\x01\x09\x00\x00\x00\x02\xa0\xfa\x7f\xba\x21\x89"
45 "\xa6\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
46 "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
47 "\x80\x7b\x03\x00\x00\xff\xff\xec\xc2\x01\x0d\x00\x00\x00\xc2\x20"
48 "\xdf\xbf\xb4\x45\x18\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
49 "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
50 "\x00\x00\x00\x00\x00\xeb\x00\x00\x00\xff\xff\xec\xc2\x01\x0d\x00"
51 "\x00\x00\xc2\x20\xdf\xbf\xb4\x45\x18\x00\x00\x00\x00\x00\x00\x00"
52 "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
53 "\x00\x00\x00\x00\x00\x00\x00\x00\x00\xeb\x00\x00\x00\xff\xff\xed"
54 "\xc2\x01\x0d\x00\x00\x00\xc2\x20\xdf\xbf\xb4\x45\x18\x00\x00\x00"
55 "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
56 "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xeb\x00\xff"
57 "\x00\x10"
58 "\nendstream\nendobj\n";
59 
/* PDF object 6: the ToUnicode CMap stream referenced by object 3. It
 * declares one code space <0000>-<FFFF> and a single bfrange mapping
 * <0000>-<FFFF> onto <0000>. */
60 static const char funky_font4[] =
61 "6 0 obj\n<</Length 353>>\nstream\n"
62 "/CIDInit /ProcSet findresource begin\n"
63 "12 dict begin\n"
64 "begincmap\n"
65 "/CIDSystemInfo\n"
66 "<<\n"
67 "  /Registry (Adobe)\n"
68 "  /Ordering (UCS)\n"
69 "  /Supplement 0\n"
70 ">> def\n"
71 "/CMapName /Adobe-Identity-UCS def\n"
72 "/CMapType 2 def\n"
73 "1 begincodespacerange\n"
74 "<0000> <FFFF>\n"
75 "endcodespacerange\n"
76 "1 beginbfrange\n"
77 "<0000> <FFFF> <0000>\n"
78 "endbfrange\n"
79 "endcmap\n"
80 "CMapName currentdict /CMap defineresource pop\n"
81 "end\n"
82 "end\n"
83 "endstream\n"
84 "endobj\n";
85 
/* PDF object 7: the FontDescriptor for /GlyphLessFont, referenced by
 * object 4. Its font program (/FontFile2) is object 8. */
86 static const char funky_font5[] =
87 "7 0 obj\n"
88 "<</Ascent 1000/CapHeight 1000/Descent -1/Flags 5"
89 "/FontBBox[0 0 500 1000]/FontFile2 8 0 R/FontName/GlyphLessFont"
90 "/ItalicAngle 0/StemV 80/Type/FontDescriptor>>\nendobj\n";
91 
/* PDF object 8: the embedded font program stream (/FontFile2 of object 7).
 * The payload is raw binary containing NUL bytes, so it is written with an
 * explicit length (sizeof-1) rather than as a C string. */
92 static const char funky_font6[] =
93 "8 0 obj\n<</Length 572/Length1 572>>\nstream\n"
94 "\x00\x01\x00\x00\x00\x0a\x00\x80\x00\x03\x00\x20\x4f\x53\x2f\x32"
95 "\x56\xde\xc8\x94\x00\x00\x01\x28\x00\x00\x00\x60\x63\x6d\x61\x70"
96 "\x00\x0a\x00\x34\x00\x00\x01\x90\x00\x00\x00\x1e\x67\x6c\x79\x66"
97 "\x15\x22\x41\x24\x00\x00\x01\xb8\x00\x00\x00\x18\x68\x65\x61\x64"
98 "\x0b\x78\xf1\x65\x00\x00\x00\xac\x00\x00\x00\x36\x68\x68\x65\x61"
99 "\x0c\x02\x04\x02\x00\x00\x00\xe4\x00\x00\x00\x24\x68\x6d\x74\x78"
100 "\x04\x00\x00\x00\x00\x00\x01\x88\x00\x00\x00\x08\x6c\x6f\x63\x61"
101 "\x00\x0c\x00\x00\x00\x00\x01\xb0\x00\x00\x00\x06\x6d\x61\x78\x70"
102 "\x00\x04\x00\x05\x00\x00\x01\x08\x00\x00\x00\x20\x6e\x61\x6d\x65"
103 "\xf2\xeb\x16\xda\x00\x00\x01\xd0\x00\x00\x00\x4b\x70\x6f\x73\x74"
104 "\x00\x01\x00\x01\x00\x00\x02\x1c\x00\x00\x00\x20\x00\x01\x00\x00"
105 "\x00\x01\x00\x00\xb0\x94\x71\x10\x5f\x0f\x3c\xf5\x04\x07\x08\x00"
106 "\x00\x00\x00\x00\xcf\x9a\xfc\x6e\x00\x00\x00\x00\xd4\xc3\xa7\xf2"
107 "\x00\x00\x00\x00\x04\x00\x08\x00\x00\x00\x00\x10\x00\x02\x00\x00"
108 "\x00\x00\x00\x00\x00\x01\x00\x00\x08\x00\xff\xff\x00\x00\x04\x00"
109 "\x00\x00\x00\x00\x04\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00"
110 "\x00\x00\x00\x00\x00\x00\x00\x02\x00\x01\x00\x00\x00\x02\x00\x04"
111 "\x00\x01\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00"
112 "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x01\x90\x00\x05"
113 "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
114 "\x00\x00\x00\x00\x00\x00\x00\x00\x05\x00\x01\x00\x01\x00\x00\x00"
115 "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
116 "\x00\x00\x47\x4f\x4f\x47\x00\x40\x00\x00\x00\x00\x00\x01\xff\xff"
117 "\x00\x00\x00\x01\x00\x01\x80\x00\x00\x00\x00\x00\x00\x00\x00\x00"
118 "\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x04\x00\x00\x00"
119 "\x00\x00\x00\x02\x00\x01\x00\x00\x00\x00\x00\x14\x00\x03\x00\x00"
120 "\x00\x00\x00\x14\x00\x06\x00\x0a\x00\x00\x00\x00\x00\x00\x00\x00"
121 "\x00\x00\x00\x00\x00\x0c\x00\x00\x00\x01\x00\x00\x00\x00\x04\x00"
122 "\x08\x00\x00\x03\x00\x00\x31\x21\x11\x21\x04\x00\xfc\x00\x08\x00"
123 "\x00\x00\x00\x03\x00\x2a\x00\x00\x00\x03\x00\x00\x00\x05\x00\x16"
124 "\x00\x00\x00\x01\x00\x00\x00\x00\x00\x05\x00\x0b\x00\x16\x00\x03"
125 "\x00\x01\x04\x09\x00\x05\x00\x16\x00\x00\x00\x56\x00\x65\x00\x72"
126 "\x00\x73\x00\x69\x00\x6f\x00\x6e\x00\x20\x00\x31\x00\x2e\x00\x30"
127 "\x56\x65\x72\x73\x69\x6f\x6e\x20\x31\x2e\x30\x00\x00\x01\x00\x00"
128 "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00"
129 "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
130 "\nendstream\nendobj\n";
131 
132 #endif
133 
134 fz_pdfocr_options *
fz_parse_pdfocr_options(fz_context * ctx,fz_pdfocr_options * opts,const char * args)135 fz_parse_pdfocr_options(fz_context *ctx, fz_pdfocr_options *opts, const char *args)
136 {
137 #ifdef OCR_DISABLED
138 	fz_throw(ctx, FZ_ERROR_GENERIC, "No OCR support in this build");
139 #else
140 	const char *val;
141 
142 	memset(opts, 0, sizeof *opts);
143 
144 	if (fz_has_option(ctx, args, "compression", &val))
145 	{
146 		if (fz_option_eq(val, "none"))
147 			opts->compress = 0;
148 		else if (fz_option_eq(val, "flate"))
149 			opts->compress = 1;
150 		else
151 			fz_throw(ctx, FZ_ERROR_GENERIC, "Unsupported PDFOCR compression %s (none, or flate only)", val);
152 	}
153 	if (fz_has_option(ctx, args, "strip-height", &val))
154 	{
155 		int i = fz_atoi(val);
156 		if (i <= 0)
157 			fz_throw(ctx, FZ_ERROR_GENERIC, "Unsupported PDFOCR strip height %d (suggest 0)", i);
158 		opts->strip_height = i;
159 	}
160 	if (fz_has_option(ctx, args, "ocr-language", &val))
161 	{
162 		fz_strlcpy(opts->language, val, sizeof(opts->language));
163 	}
164 
165 	return opts;
166 #endif
167 }
168 
169 void
fz_write_pixmap_as_pdfocr(fz_context * ctx,fz_output * out,const fz_pixmap * pixmap,const fz_pdfocr_options * pdfocr)170 fz_write_pixmap_as_pdfocr(fz_context *ctx, fz_output *out, const fz_pixmap *pixmap, const fz_pdfocr_options *pdfocr)
171 {
172 #ifdef OCR_DISABLED
173 	fz_throw(ctx, FZ_ERROR_GENERIC, "No OCR support in this build");
174 #else
175 	fz_band_writer *writer;
176 
177 	if (!pixmap || !out)
178 		return;
179 
180 	writer = fz_new_pdfocr_band_writer(ctx, out, pdfocr);
181 	fz_try(ctx)
182 	{
183 		fz_write_header(ctx, writer, pixmap->w, pixmap->h, pixmap->n, pixmap->alpha, pixmap->xres, pixmap->yres, 0, pixmap->colorspace, pixmap->seps);
184 		fz_write_band(ctx, writer, pixmap->stride, pixmap->h, pixmap->samples);
185 	}
186 	fz_always(ctx)
187 		fz_drop_band_writer(ctx, writer);
188 	fz_catch(ctx)
189 		fz_rethrow(ctx);
190 #endif
191 }
192 
193 #ifndef OCR_DISABLED
/* State of the PDFOCR band writer: the generic band writer plus the
 * bookkeeping needed to emit PDF objects and to OCR each page. */
194 typedef struct pdfocr_band_writer_s
195 {
196 	fz_band_writer super; /* Generic band writer; must stay first so the struct can be cast. */
197 	fz_pdfocr_options options; /* Copy of the caller's options (zeroed if none were given). */
198 
199 	int obj_num; /* Next PDF object number to hand out (starts at 9). */
200 	int xref_max; /* Number of entries allocated in xref. */
201 	int64_t *xref; /* Output offset of each object, indexed by object number. */
202 	int pages; /* Number of pages started so far. */
203 	int page_max; /* Number of entries allocated in page_obj. */
204 	int *page_obj; /* Object number of each page's Page object. */
205 	unsigned char *stripbuf; /* Samples of the strip currently being filled (w * strip height * n bytes). */
206 	unsigned char *compbuf; /* Scratch buffer for deflated strip data. */
207 	size_t complen; /* Size of compbuf (deflate bound of a full strip). */
208 
209 	void *tessapi; /* Opaque handle returned by ocr_init. */
210 	fz_pixmap *ocrbitmap; /* One-byte-per-pixel copy of the page handed to ocr_recognise. */
211 } pdfocr_band_writer;
212 
213 static int
new_obj(fz_context * ctx,pdfocr_band_writer * writer)214 new_obj(fz_context *ctx, pdfocr_band_writer *writer)
215 {
216 	int64_t pos = fz_tell_output(ctx, writer->super.out);
217 
218 	if (writer->obj_num >= writer->xref_max)
219 	{
220 		int new_max = writer->xref_max * 2;
221 		if (new_max < writer->obj_num + 8)
222 			new_max = writer->obj_num + 8;
223 		writer->xref = fz_realloc_array(ctx, writer->xref, new_max, int64_t);
224 		writer->xref_max = new_max;
225 	}
226 
227 	writer->xref[writer->obj_num] = pos;
228 
229 	return writer->obj_num++;
230 }
231 
232 static void
pdfocr_write_header(fz_context * ctx,fz_band_writer * writer_,fz_colorspace * cs)233 pdfocr_write_header(fz_context *ctx, fz_band_writer *writer_, fz_colorspace *cs)
234 {
235 	pdfocr_band_writer *writer = (pdfocr_band_writer *)writer_;
236 	fz_output *out = writer->super.out;
237 	int w = writer->super.w;
238 	int h = writer->super.h;
239 	int n = writer->super.n;
240 	int s = writer->super.s;
241 	int a = writer->super.alpha;
242 	int xres = writer->super.xres;
243 	int yres = writer->super.yres;
244 	int sh = writer->options.strip_height;
245 	int strips;
246 	int i;
247 
248 	if (sh == 0)
249 		sh = h;
250 	strips = (h + sh-1)/sh;
251 
252 	if (a != 0)
253 		fz_throw(ctx, FZ_ERROR_GENERIC, "PDFOCR cannot write alpha channel");
254 	if (s != 0)
255 		fz_throw(ctx, FZ_ERROR_GENERIC, "PDFOCR cannot write spot colors");
256 	if (n != 3 && n != 1)
257 		fz_throw(ctx, FZ_ERROR_GENERIC, "PDFOCR expected to be Grayscale or RGB");
258 
259 	fz_free(ctx, writer->stripbuf);
260 	writer->stripbuf = NULL;
261 	fz_free(ctx, writer->compbuf);
262 	writer->compbuf = NULL;
263 	fz_drop_pixmap(ctx, writer->ocrbitmap);
264 	writer->ocrbitmap = NULL;
265 	writer->stripbuf = Memento_label(fz_malloc(ctx, (size_t)w * sh * n), "pdfocr_stripbuf");
266 	writer->complen = fz_deflate_bound(ctx, (size_t)w * sh * n);
267 	writer->compbuf = Memento_label(fz_malloc(ctx, writer->complen), "pdfocr_compbuf");
268 	/* Always round the width of ocrbitmap up to a multiple of 4. */
269 	writer->ocrbitmap = fz_new_pixmap(ctx, NULL, (w+3)&~3, h, NULL, 0);
270 	fz_set_pixmap_resolution(ctx, writer->ocrbitmap, xres, yres);
271 
272 	/* Send the file header on the first page */
273 	if (writer->pages == 0)
274 	{
275 		fz_write_string(ctx, out, "%PDF-1.4\n%PDFOCR-1.0\n");
276 
277 		if (writer->xref_max < 9)
278 		{
279 			int new_max = 9;
280 			writer->xref = fz_realloc_array(ctx, writer->xref, new_max, int64_t);
281 			writer->xref_max = new_max;
282 		}
283 		writer->xref[3] = fz_tell_output(ctx, out);
284 		fz_write_data(ctx, out, funky_font,  sizeof(funky_font)-1);
285 		writer->xref[4] = fz_tell_output(ctx, out);
286 		fz_write_data(ctx, out, funky_font2, sizeof(funky_font2)-1);
287 		writer->xref[5] = fz_tell_output(ctx, out);
288 		fz_write_data(ctx, out, funky_font3, sizeof(funky_font3)-1);
289 		writer->xref[6] = fz_tell_output(ctx, out);
290 		fz_write_data(ctx, out, funky_font4, sizeof(funky_font4)-1);
291 		writer->xref[7] = fz_tell_output(ctx, out);
292 		fz_write_data(ctx, out, funky_font5, sizeof(funky_font5)-1);
293 		writer->xref[8] = fz_tell_output(ctx, out);
294 		fz_write_data(ctx, out, funky_font6, sizeof(funky_font6)-1);
295 	}
296 
297 	if (writer->page_max <= writer->pages)
298 	{
299 		int new_max = writer->page_max * 2;
300 		if (new_max == 0)
301 			new_max = writer->pages + 8;
302 		writer->page_obj = fz_realloc_array(ctx, writer->page_obj, new_max, int);
303 		writer->page_max = new_max;
304 	}
305 	writer->page_obj[writer->pages] = writer->obj_num;
306 	writer->pages++;
307 
308 	/* Send the Page Object */
309 	fz_write_printf(ctx, out, "%d 0 obj\n<</Type/Page/Parent 2 0 R/Resources<</XObject<<", new_obj(ctx, writer));
310 	for (i = 0; i < strips; i++)
311 		fz_write_printf(ctx, out, "/I%d %d 0 R", i, writer->obj_num + i);
312 	fz_write_printf(ctx, out, ">>/Font<</F0 3 0 R>>>>/MediaBox[0 0 %g %g]/Contents %d 0 R>>\nendobj\n",
313 		w * 72.0f / xres, h * 72.0f / yres, writer->obj_num + strips);
314 }
315 
316 static void
flush_strip(fz_context * ctx,pdfocr_band_writer * writer,int fill)317 flush_strip(fz_context *ctx, pdfocr_band_writer *writer, int fill)
318 {
319 	unsigned char *data = writer->stripbuf;
320 	fz_output *out = writer->super.out;
321 	int w = writer->super.w;
322 	int n = writer->super.n;
323 	size_t len = (size_t)w*n*fill;
324 
325 	/* Buffer is full, compress it and write it. */
326 	if (writer->options.compress)
327 	{
328 		size_t destLen = writer->complen;
329 		fz_deflate(ctx, writer->compbuf, &destLen, data, len, FZ_DEFLATE_DEFAULT);
330 		len = destLen;
331 		data = writer->compbuf;
332 	}
333 	fz_write_printf(ctx, out, "%d 0 obj\n<</Width %d/ColorSpace/Device%s/Height %d%s/Subtype/Image",
334 		new_obj(ctx, writer), w, n == 1 ? "Gray" : "RGB", fill, writer->options.compress ? "/Filter/FlateDecode" : "");
335 	fz_write_printf(ctx, out, "/Length %zd/Type/XObject/BitsPerComponent 8>>\nstream\n", len);
336 	fz_write_data(ctx, out, data, len);
337 	fz_write_string(ctx, out, "\nendstream\nendobj\n");
338 }
339 
340 static void
pdfocr_write_band(fz_context * ctx,fz_band_writer * writer_,int stride,int band_start,int band_height,const unsigned char * sp)341 pdfocr_write_band(fz_context *ctx, fz_band_writer *writer_, int stride, int band_start, int band_height, const unsigned char *sp)
342 {
343 	pdfocr_band_writer *writer = (pdfocr_band_writer *)writer_;
344 	fz_output *out = writer->super.out;
345 	int w = writer->super.w;
346 	int h = writer->super.h;
347 	int n = writer->super.n;
348 	int sh = writer->options.strip_height;
349 	int line;
350 	unsigned char *d = writer->ocrbitmap->samples;
351 
352 	if (!out)
353 		return;
354 
355 	if (sh == 0)
356 		sh = h;
357 
358 	for (line = 0; line < band_height; line++)
359 	{
360 		int dstline = (band_start+line) % sh;
361 		memcpy(writer->stripbuf + (size_t)w*n*dstline,
362 			   sp + (size_t)line * w * n,
363 			   (size_t)w * n);
364 		if (dstline+1 == sh)
365 			flush_strip(ctx, writer, dstline+1);
366 	}
367 
368 	if (band_start + band_height == h && h % sh != 0)
369 		flush_strip(ctx, writer, h % sh);
370 
371 	/* Copy strip to ocrbitmap, converting if required. */
372 	d += band_start*w;
373 	if (n == 1)
374 	{
375 		int y;
376 		for (y = band_height; y > 0; y--)
377 		{
378 			memcpy(d, sp, w);
379 			if (writer->ocrbitmap->w - w)
380 				memset(d + w, 0, writer->ocrbitmap->w - w);
381 			d += writer->ocrbitmap->w;
382 		}
383 	}
384 	else
385 	{
386 		int x, y;
387 		for (y = band_height; y > 0; y--)
388 		{
389 			for (x = w; x > 0; x--)
390 			{
391 				*d++ = (sp[0] + 2*sp[1] + sp[2] + 2)>>2;
392 				sp += 3;
393 			}
394 			for (x = writer->ocrbitmap->w - w; x > 0; x--)
395 				*d++ = 0;
396 		}
397 	}
398 }
399 
/* State shared between pdfocr_write_trailer, char_callback and flush_word
 * while the OCR results for one page are turned into text operators. */
400 typedef struct
401 {
402 	fz_buffer *buf; /* Page content stream being built. */
403 	pdfocr_band_writer *writer; /* Band writer whose ocrbitmap supplies size and resolution. */
404 	int word_max; /* Number of entries allocated in word_chars. */
405 	int word_len; /* Number of characters collected for the pending word. */
406 	int *word_chars; /* Character codes of the pending word. */
407 	float word_bbox[4]; /* Box of the pending word: [0],[2] horizontal and [1],[3] vertical, scaled by 72/resolution. */
408 	float cur_size; /* Font size most recently emitted with Tf. */
409 	float cur_scale; /* Horizontal scale most recently emitted with Tz. */
410 	float tx, ty; /* Position of the last Td, so the next one can be made relative. */
411 } char_callback_data_t;
412 
413 static void
flush_word(fz_context * ctx,char_callback_data_t * cb)414 flush_word(fz_context *ctx,
415 	char_callback_data_t *cb)
416 {
417 	float size = cb->word_bbox[3] - cb->word_bbox[1];
418 	float scale;
419 	int i, len = cb->word_len;
420 	float x, y;
421 
422 	if (cb->word_len == 0 || size == 0)
423 		return;
424 
425 	if (size != cb->cur_size)
426 	{
427 		fz_append_printf(ctx, cb->buf, "/F0 %g Tf\n", size);
428 		cb->cur_size = size;
429 	}
430 	scale = (cb->word_bbox[2] - cb->word_bbox[0]) / size / len * 200;
431 	if (scale != cb->cur_scale)
432 	{
433 		fz_append_printf(ctx, cb->buf, "%d Tz\n", (int)scale);
434 		cb->cur_scale = scale;
435 	}
436 
437 	x = cb->word_bbox[0];
438 	y = cb->word_bbox[1];
439 	fz_append_printf(ctx, cb->buf, "%g %g Td\n", x-cb->tx, y-cb->ty);
440 	cb->tx = x;
441 	cb->ty = y;
442 
443 	fz_append_printf(ctx, cb->buf, "<");
444 	for (i = 0; i < len; i++)
445 		fz_append_printf(ctx, cb->buf, "%04x", cb->word_chars[i]);
446 	fz_append_printf(ctx, cb->buf, ">Tj\n");
447 
448 	cb->word_len = 0;
449 }
450 
451 static void
char_callback(fz_context * ctx,void * arg,int unicode,const char * font_name,const int * line_bbox,const int * word_bbox,const int * char_bbox,int pointsize)452 char_callback(fz_context *ctx, void *arg, int unicode,
453 		const char *font_name,
454 		const int *line_bbox, const int *word_bbox,
455 		const int *char_bbox, int pointsize)
456 {
457 	char_callback_data_t *cb = (char_callback_data_t *)arg;
458 	pdfocr_band_writer *writer = cb->writer;
459 	float bbox[4];
460 
461 	bbox[0] = word_bbox[0] * 72.0f / cb->writer->ocrbitmap->xres;
462 	bbox[3] = (writer->ocrbitmap->h - 1 - word_bbox[1]) * 72.0f / cb->writer->ocrbitmap->yres;
463 	bbox[2] = word_bbox[2] * 72.0f / cb->writer->ocrbitmap->yres;
464 	bbox[1] = (writer->ocrbitmap->h - 1 - word_bbox[3]) * 72.0f / cb->writer->ocrbitmap->yres;
465 
466 	if (bbox[0] != cb->word_bbox[0] ||
467 		bbox[1] != cb->word_bbox[1] ||
468 		bbox[2] != cb->word_bbox[2] ||
469 		bbox[3] != cb->word_bbox[3])
470 	{
471 		flush_word(ctx, cb);
472 		memcpy(cb->word_bbox, bbox, 4 * sizeof(float));
473 	}
474 
475 	if (cb->word_max == cb->word_len)
476 	{
477 		int newmax = cb->word_max * 2;
478 		if (newmax == 0)
479 			newmax = 16;
480 		cb->word_chars = fz_realloc_array(ctx, cb->word_chars, newmax, int);
481 		cb->word_max = newmax;
482 	}
483 
484 	cb->word_chars[cb->word_len++] = unicode;
485 }
486 
487 static void
pdfocr_write_trailer(fz_context * ctx,fz_band_writer * writer_)488 pdfocr_write_trailer(fz_context *ctx, fz_band_writer *writer_)
489 {
490 	pdfocr_band_writer *writer = (pdfocr_band_writer *)writer_;
491 	fz_output *out = writer->super.out;
492 	int w = writer->super.w;
493 	int h = writer->super.h;
494 	int xres = writer->super.xres;
495 	int yres = writer->super.yres;
496 	int sh = writer->options.strip_height;
497 	int strips;
498 	int i;
499 	size_t len;
500 	unsigned char *data;
501 	fz_buffer *buf = NULL;
502 	char_callback_data_t cb = { NULL };
503 
504 	if (sh == 0)
505 		sh = h;
506 	strips = (h + sh-1)/sh;
507 
508 	/* Send the Page contents */
509 	/* We need the length to this, so write to a buffer first */
510 	fz_var(buf);
511 	fz_var(cb);
512 	fz_try(ctx)
513 	{
514 		cb.writer = writer;
515 		cb.buf = buf = fz_new_buffer(ctx, 0);
516 		fz_append_printf(ctx, buf, "q\n%g 0 0 %g 0 0 cm\n", 72.0f/xres, 72.0f/yres);
517 		for (i = 0; i < strips; i++)
518 		{
519 			int at = h - (i+1)*sh;
520 			int this_sh = sh;
521 			if (at < 0)
522 			{
523 				this_sh += at;
524 				at = 0;
525 			}
526 			fz_append_printf(ctx, buf, "/P <</MCID 0>> BDC\nq\n%d 0 0 %d 0 %d cm\n/I%d Do\nQ\n",
527 				w, this_sh, at, i);
528 		}
529 
530 		fz_append_printf(ctx, buf, "Q\nBT\n3 Tr\n");
531 
532 		ocr_recognise(ctx, writer->tessapi, writer->ocrbitmap, char_callback, &cb);
533 		flush_word(ctx, &cb);
534 		fz_append_printf(ctx, buf, "ET\n");
535 
536 		len = fz_buffer_storage(ctx, buf, &data);
537 		fz_write_printf(ctx, out, "%d 0 obj\n<</Length %zd>>\nstream\n", new_obj(ctx, writer), len);
538 		fz_write_data(ctx, out, data, len);
539 		fz_drop_buffer(ctx, buf);
540 		buf = NULL;
541 		fz_write_string(ctx, out, "\nendstream\nendobj\n");
542 	}
543 	fz_always(ctx)
544 	{
545 		fz_free(ctx, cb.word_chars);
546 	}
547 	fz_catch(ctx)
548 	{
549 		fz_drop_buffer(ctx, buf);
550 		fz_rethrow(ctx);
551 	}
552 }
553 
554 static void
pdfocr_drop_band_writer(fz_context * ctx,fz_band_writer * writer_)555 pdfocr_drop_band_writer(fz_context *ctx, fz_band_writer *writer_)
556 {
557 	pdfocr_band_writer *writer = (pdfocr_band_writer *)writer_;
558 	fz_output *out = writer->super.out;
559 	int i;
560 
561 	/* We actually do the trailer writing in the drop */
562 	if (writer->xref_max > 2)
563 	{
564 		int64_t t_pos;
565 
566 		/* Catalog */
567 		writer->xref[1] = fz_tell_output(ctx, out);
568 		fz_write_printf(ctx, out, "1 0 obj\n<</Type/Catalog/Pages 2 0 R>>\nendobj\n");
569 
570 		/* Page table */
571 		writer->xref[2] = fz_tell_output(ctx, out);
572 		fz_write_printf(ctx, out, "2 0 obj\n<</Count %d/Kids[", writer->pages);
573 
574 		for (i = 0; i < writer->pages; i++)
575 		{
576 			if (i > 0)
577 				fz_write_byte(ctx, out, ' ');
578 			fz_write_printf(ctx, out, "%d 0 R", writer->page_obj[i]);
579 		}
580 		fz_write_string(ctx, out, "]/Type/Pages>>\nendobj\n");
581 
582 		/* Xref */
583 		t_pos = fz_tell_output(ctx, out);
584 		fz_write_printf(ctx, out, "xref\n0 %d\n0000000000 65535 f \n", writer->obj_num);
585 		for (i = 1; i < writer->obj_num; i++)
586 			fz_write_printf(ctx, out, "%010zd 00000 n \n", writer->xref[i]);
587 		fz_write_printf(ctx, out, "trailer\n<</Size %d/Root 1 0 R>>\nstartxref\n%ld\n%%%%EOF\n", writer->obj_num, t_pos);
588 	}
589 
590 	fz_free(ctx, writer->stripbuf);
591 	fz_free(ctx, writer->compbuf);
592 	fz_free(ctx, writer->page_obj);
593 	fz_free(ctx, writer->xref);
594 	fz_drop_pixmap(ctx, writer->ocrbitmap);
595 
596 	ocr_fin(ctx, writer->tessapi);
597 }
598 #endif
599 
fz_new_pdfocr_band_writer(fz_context * ctx,fz_output * out,const fz_pdfocr_options * options)600 fz_band_writer *fz_new_pdfocr_band_writer(fz_context *ctx, fz_output *out, const fz_pdfocr_options *options)
601 {
602 #ifdef OCR_DISABLED
603 	fz_throw(ctx, FZ_ERROR_GENERIC, "No OCR support in this build");
604 #else
605 	pdfocr_band_writer *writer = fz_new_band_writer(ctx, pdfocr_band_writer, out);
606 
607 	writer->super.header = pdfocr_write_header;
608 	writer->super.band = pdfocr_write_band;
609 	writer->super.trailer = pdfocr_write_trailer;
610 	writer->super.drop = pdfocr_drop_band_writer;
611 
612 	if (options)
613 		writer->options = *options;
614 	else
615 		memset(&writer->options, 0, sizeof(writer->options));
616 
617 	/* Objects:
618 	 *  1 reserved for catalog
619 	 *  2 for pages tree
620 	 *  3 font
621 	 *  4 cidfont
622 	 *  5 cid to gid map
623 	 *  6 tounicode
624 	 *  7 font descriptor
625 	 *  8 font file
626 	 */
627 	writer->obj_num = 9;
628 
629 	fz_try(ctx)
630 	{
631 		writer->tessapi = ocr_init(ctx, writer->options.language);
632 	}
633 	fz_catch(ctx)
634 	{
635 		fz_drop_band_writer(ctx, &writer->super);
636 		fz_throw(ctx, FZ_ERROR_GENERIC, "OCR initialisation failed");
637 	}
638 
639 	return &writer->super;
640 #endif
641 }
642 
643 void
fz_save_pixmap_as_pdfocr(fz_context * ctx,fz_pixmap * pixmap,char * filename,int append,const fz_pdfocr_options * pdfocr)644 fz_save_pixmap_as_pdfocr(fz_context *ctx, fz_pixmap *pixmap, char *filename, int append, const fz_pdfocr_options *pdfocr)
645 {
646 #ifdef OCR_DISABLED
647 	fz_throw(ctx, FZ_ERROR_GENERIC, "No OCR support in this build");
648 #else
649 	fz_output *out = fz_new_output_with_path(ctx, filename, append);
650 	fz_try(ctx)
651 	{
652 		fz_write_pixmap_as_pdfocr(ctx, out, pixmap, pdfocr);
653 		fz_close_output(ctx, out);
654 	}
655 	fz_always(ctx)
656 		fz_drop_output(ctx, out);
657 	fz_catch(ctx)
658 		fz_rethrow(ctx);
659 #endif
660 }
661 
662 /* High-level document writer interface */
663 
664 #ifndef OCR_DISABLED
/* Document writer that renders each page to a pixmap and feeds it to a
 * PDFOCR band writer. */
665 typedef struct
666 {
667 	fz_document_writer super; /* Generic document writer; must stay first so the struct can be cast. */
668 	fz_draw_options draw; /* Draw options parsed from the option string. */
669 	fz_pdfocr_options pdfocr; /* PDFOCR options parsed from the option string. */
670 	fz_pixmap *pixmap; /* Pixmap of the page currently being drawn; NULL between pages. */
671 	fz_band_writer *bander; /* PDFOCR band writer; set to NULL once the writer is closed. */
672 	fz_output *out; /* Destination output. */
673 	int pagenum; /* Page counter passed to fz_write_header and then incremented. */
674 } fz_pdfocr_writer;
675 
676 static fz_device *
pdfocr_begin_page(fz_context * ctx,fz_document_writer * wri_,fz_rect mediabox)677 pdfocr_begin_page(fz_context *ctx, fz_document_writer *wri_, fz_rect mediabox)
678 {
679 	fz_pdfocr_writer *wri = (fz_pdfocr_writer*)wri_;
680 	return fz_new_draw_device_with_options(ctx, &wri->draw, mediabox, &wri->pixmap);
681 }
682 
683 static void
pdfocr_end_page(fz_context * ctx,fz_document_writer * wri_,fz_device * dev)684 pdfocr_end_page(fz_context *ctx, fz_document_writer *wri_, fz_device *dev)
685 {
686 	fz_pdfocr_writer *wri = (fz_pdfocr_writer*)wri_;
687 	fz_pixmap *pix = wri->pixmap;
688 
689 	fz_try(ctx)
690 	{
691 		fz_close_device(ctx, dev);
692 		fz_write_header(ctx, wri->bander, pix->w, pix->h, pix->n, pix->alpha, pix->xres, pix->yres, wri->pagenum++, pix->colorspace, pix->seps);
693 		fz_write_band(ctx, wri->bander, pix->stride, pix->h, pix->samples);
694 	}
695 	fz_always(ctx)
696 	{
697 		fz_drop_device(ctx, dev);
698 		fz_drop_pixmap(ctx, pix);
699 		wri->pixmap = NULL;
700 	}
701 	fz_catch(ctx)
702 		fz_rethrow(ctx);
703 }
704 
705 static void
pdfocr_close_writer(fz_context * ctx,fz_document_writer * wri_)706 pdfocr_close_writer(fz_context *ctx, fz_document_writer *wri_)
707 {
708 	fz_pdfocr_writer *wri = (fz_pdfocr_writer*)wri_;
709 
710 	fz_drop_band_writer(ctx, wri->bander);
711 	wri->bander = NULL;
712 
713 	fz_close_output(ctx, wri->out);
714 }
715 
716 static void
pdfocr_drop_writer(fz_context * ctx,fz_document_writer * wri_)717 pdfocr_drop_writer(fz_context *ctx, fz_document_writer *wri_)
718 {
719 	fz_pdfocr_writer *wri = (fz_pdfocr_writer*)wri_;
720 
721 	fz_drop_pixmap(ctx, wri->pixmap);
722 	fz_drop_output(ctx, wri->out);
723 	fz_drop_band_writer(ctx, wri->bander);
724 }
725 #endif
726 
727 fz_document_writer *
fz_new_pdfocr_writer_with_output(fz_context * ctx,fz_output * out,const char * options)728 fz_new_pdfocr_writer_with_output(fz_context *ctx, fz_output *out, const char *options)
729 {
730 #ifdef OCR_DISABLED
731 	fz_throw(ctx, FZ_ERROR_GENERIC, "No OCR support in this build");
732 #else
733 	fz_pdfocr_writer *wri = fz_new_derived_document_writer(ctx, fz_pdfocr_writer, pdfocr_begin_page, pdfocr_end_page, pdfocr_close_writer, pdfocr_drop_writer);
734 
735 	fz_try(ctx)
736 	{
737 		fz_parse_draw_options(ctx, &wri->draw, options);
738 		fz_parse_pdfocr_options(ctx, &wri->pdfocr, options);
739 		wri->out = out;
740 		wri->bander = fz_new_pdfocr_band_writer(ctx, wri->out, &wri->pdfocr);
741 	}
742 	fz_catch(ctx)
743 	{
744 		fz_free(ctx, wri);
745 		fz_rethrow(ctx);
746 	}
747 
748 	return (fz_document_writer*)wri;
749 #endif
750 }
751 
752 fz_document_writer *
fz_new_pdfocr_writer(fz_context * ctx,const char * path,const char * options)753 fz_new_pdfocr_writer(fz_context *ctx, const char *path, const char *options)
754 {
755 #ifdef OCR_DISABLED
756 	fz_throw(ctx, FZ_ERROR_GENERIC, "No OCR support in this build");
757 #else
758 	fz_output *out = fz_new_output_with_path(ctx, path ? path : "out.pdfocr", 0);
759 	fz_document_writer *wri = NULL;
760 	fz_try(ctx)
761 		wri = fz_new_pdfocr_writer_with_output(ctx, out, options);
762 	fz_catch(ctx)
763 	{
764 		fz_drop_output(ctx, out);
765 		fz_rethrow(ctx);
766 	}
767 	return wri;
768 #endif
769 }
770