1 /*====================================================================*
2 - Copyright (C) 2001 Leptonica. All rights reserved.
3 -
4 - Redistribution and use in source and binary forms, with or without
5 - modification, are permitted provided that the following conditions
6 - are met:
7 - 1. Redistributions of source code must retain the above copyright
8 - notice, this list of conditions and the following disclaimer.
9 - 2. Redistributions in binary form must reproduce the above
10 - copyright notice, this list of conditions and the following
11 - disclaimer in the documentation and/or other materials
12 - provided with the distribution.
13 -
14 - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
15 - ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
16 - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
17 - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ANY
18 - CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 - PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 - OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
23 - NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 *====================================================================*/
26
27 /*!
28 * \file pdfio1.c
29 * <pre>
30 *
31 * Higher-level operations for generating pdf.
32 *
33 * |=============================================================|
34 * | Important note |
35 * |=============================================================|
36 * | Some of these functions require libtiff, libjpeg, and libz |
37 * | If you do not have these libraries, you must set |
38 * | #define USE_PDFIO 0 |
39 * | in environ.h. This will link pdfiostub.c |
40 * |=============================================================|
41 *
42 * Set 1. These functions convert a set of image files
43 * to a multi-page pdf file, with one image on each page.
44 * All images are rendered at the same (input) resolution.
45 * The images can be specified as being in a directory, or they
46 * can be in an sarray. The output pdf can be either a file
47 * or an array of bytes in memory.
48 *
49 * Set 2. These functions are a special case of set 1, where
50 * no scaling or change in quality is required. For jpeg and
51 * jp2k images, the bytes in each jpeg file can be directly
52 * incorporated into the output pdf, and the wrapping up of
53 * multiple image files is very fast. For non-interlaced png,
54 * the data bytes including the predictors can also be written
55 * directly into the flate pdf data. For other image formats,
56 * transcoding is required, where the image data is first
57 * decompressed and then the G4 or Flate (gzip) encodings are generated.
58 *
59 * Set 3. These functions convert a set of images in memory
60 * to a multi-page pdf, with one image on each page. The pdf
61 * output can be either a file or an array of bytes in memory.
62 *
63 * Set 4. These functions implement a pdf output "device driver"
64 * for wrapping (encoding) any number of images on a single page
65 * in pdf. The input can be either an image file or a Pix;
66 * the pdf output can be either a file or an array of bytes in memory.
67 *
68 * Set 5. These "segmented" functions take a set of image
69 * files, along with optional segmentation information, and
70 * generate a multi-page pdf file, where each page consists
71 * in general of a mixed raster pdf of image and non-image regions.
72 * The segmentation information for each page can be input as
73 * either a mask over the image parts, or as a Boxa of those
74 * regions.
75 *
76 * Set 6. These "segmented" functions convert an image and
77 * an optional Boxa of image regions into a mixed raster pdf file
78 * for the page. The input image can be either a file or a Pix.
79 *
80 * Set 7. These functions take a set of single-page pdf files
81 * and concatenate them into a multi-page pdf.
82 * The input can be a set of single page pdf files, or of
83 * pdf 'strings' in memory. The output can be either a file or
84 * an array of bytes in memory.
85 *
86 * The images in the pdf file can be rendered using a pdf viewer,
87 * such as gv, evince, xpdf or acroread.
88 *
89 * Reference on the pdf file format:
90 * http://www.adobe.com/devnet/pdf/pdf_reference_archive.html
91 *
92 * 1. Convert specified image files to pdf (one image file per page)
93 * l_int32 convertFilesToPdf()
94 * l_int32 saConvertFilesToPdf()
95 * l_int32 saConvertFilesToPdfData()
96 * l_int32 selectDefaultPdfEncoding()
97 *
98 * 2. Convert specified image files to pdf without scaling
99 * l_int32 convertUnscaledFilesToPdf()
100 * l_int32 saConvertUnscaledFilesToPdf()
101 * l_int32 saConvertUnscaledFilesToPdfData()
102 * l_int32 convertUnscaledToPdfData()
103 *
104 * 3. Convert multiple images to pdf (one image per page)
105 * l_int32 pixaConvertToPdf()
106 * l_int32 pixaConvertToPdfData()
107 *
108 * 4. Single page, multi-image converters
109 * l_int32 convertToPdf()
110 * l_int32 convertImageDataToPdf()
111 * l_int32 convertToPdfData()
112 * l_int32 convertImageDataToPdfData()
113 * l_int32 pixConvertToPdf()
114 * l_int32 pixWriteStreamPdf()
115 * l_int32 pixWriteMemPdf()
116 *
117 * 5. Segmented multi-page, multi-image converter
118 * l_int32 convertSegmentedFilesToPdf()
119 * BOXAA *convertNumberedMasksToBoxaa()
120 *
121 * 6. Segmented single page, multi-image converters
122 * l_int32 convertToPdfSegmented()
123 * l_int32 pixConvertToPdfSegmented()
124 * l_int32 convertToPdfDataSegmented()
125 * l_int32 pixConvertToPdfDataSegmented()
126 *
127 * 7. Multipage concatenation
128 * l_int32 concatenatePdf()
129 * l_int32 saConcatenatePdf()
130 * l_int32 ptraConcatenatePdf()
131 * l_int32 concatenatePdfToData()
132 * l_int32 saConcatenatePdfToData()
133 *
134 * The top-level multi-image functions can be visualized as follows:
135 * Output pdf data to file:
136 * convertToPdf() and convertImageDataToPdf()
137 * --> pixConvertToPdf()
138 * --> pixConvertToPdfData()
139 *
140 * Output pdf data to array in memory:
141 * convertToPdfData() and convertImageDataToPdfData()
142 * --> pixConvertToPdfData()
143 *
144 * The top-level segmented image functions can be visualized as follows:
145 * Output pdf data to file:
146 * convertToPdfSegmented()
147 * --> pixConvertToPdfSegmented()
148 * --> pixConvertToPdfDataSegmented()
149 *
150 * Output pdf data to array in memory:
151 * convertToPdfDataSegmented()
152 * --> pixConvertToPdfDataSegmented()
153 *
154 * For multi-page concatenation, there are three different types of input
155 * (1) directory and optional filename filter
156 * (2) sarray of filenames
157 * (3) ptra of byte arrays of pdf data
158 * and two types of output for the concatenated pdf data
159 * (1) filename
160 * (2) data array and size
161 * High-level interfaces are given for each of the six combinations.
162 *
163 * Note: When wrapping small images into pdf, it is useful to give
164 * them a relatively low resolution value, to avoid rounding errors
165 * when rendering the images. For example, if you want an image
166 * of width w pixels to be 5 inches wide on a screen, choose a
167 * resolution w/5.
168 *
169 * The very fast functions in section (2) require neither transcoding
170 * nor parsing of the compressed jpeg file. With three types of image
171 * compression, the compressed strings can be incorporated into
172 * the pdf data without decompression and re-encoding: jpeg, jp2k
173 * and png. The DCTDecode and JPXDecode filters can handle the
174 * entire jpeg and jp2k encoded string as a byte array in the pdf file.
175 * The FlateDecode filter can handle the png compressed image data,
176 * including predictors that occur as the first byte in each
177 * raster line, but it is necessary to store only the png IDAT chunk
178 * data in the pdf array. The alternative for wrapping png images
179 * is to uncompress into a raster (a pix) and then gzip the raster data.
180 * This typically results in a larger pdf file, because it doesn't
181 * use the two-dimensional png predictor. Colormaps, which are found
182 * in png PLTE chunks, must always be pulled out and included separately
183 * in the pdf. For CCITT-G4 compression, you can not simply
184 * include a tiff G4 file -- you must either parse it and extract the
185 * G4 compressed data within it, or uncompress to a raster and
186 * G4 compress again.
187 * </pre>
188 */
189
190 #include <string.h>
191 #include <math.h>
192 #include "allheaders.h"
193
194 /* --------------------------------------------*/
195 #if USE_PDFIO /* defined in environ.h */
196 /* --------------------------------------------*/
197
198 /* Typical scan resolution in ppi (pixels/inch) */
199 static const l_int32 DEFAULT_INPUT_RES = 300;
200
201
202 /*---------------------------------------------------------------------*
203 * Convert specified image files to pdf (one image file per page) *
204 *---------------------------------------------------------------------*/
205 /*!
206 * \brief convertFilesToPdf()
207 *
208 * \param[in] dirname directory name containing images
209 * \param[in] substr [optional] substring filter on filenames; can be NULL
210 * \param[in] res input resolution of all images
211 * \param[in] scalefactor scaling factor applied to each image; > 0.0
212 * \param[in] type encoding type (L_JPEG_ENCODE, L_G4_ENCODE,
213 * L_FLATE_ENCODE, or 0 for default
214 * \param[in] quality used for JPEG only; 0 for default (75)
215 * \param[in] title [optional] pdf title; if null, taken from the first
216 * image filename
217 * \param[in] fileout pdf file of all images
218 * \return 0 if OK, 1 on error
219 *
220 * <pre>
221 * Notes:
222 * (1) If %substr is not NULL, only image filenames that contain
223 * the substring can be used. If %substr == NULL, all files
224 * in the directory are used.
225 * (2) The files in the directory, after optional filtering by
226 * the substring, are lexically sorted in increasing order
227 * before concatenation.
228 * (3) The scalefactor is applied to each image before encoding.
229 * If you enter a value <= 0.0, it will be set to 1.0.
230 * (4) Specifying one of the three encoding types for %type forces
231 * all images to be compressed with that type. Use 0 to have
232 * the type determined for each image based on depth and whether
233 * or not it has a colormap.
234 * </pre>
235 */
236 l_int32
convertFilesToPdf(const char * dirname,const char * substr,l_int32 res,l_float32 scalefactor,l_int32 type,l_int32 quality,const char * title,const char * fileout)237 convertFilesToPdf(const char *dirname,
238 const char *substr,
239 l_int32 res,
240 l_float32 scalefactor,
241 l_int32 type,
242 l_int32 quality,
243 const char *title,
244 const char *fileout)
245 {
246 l_int32 ret;
247 SARRAY *sa;
248
249 PROCNAME("convertFilesToPdf");
250
251 if (!dirname)
252 return ERROR_INT("dirname not defined", procName, 1);
253 if (!fileout)
254 return ERROR_INT("fileout not defined", procName, 1);
255
256 if ((sa = getSortedPathnamesInDirectory(dirname, substr, 0, 0)) == NULL)
257 return ERROR_INT("sa not made", procName, 1);
258 ret = saConvertFilesToPdf(sa, res, scalefactor, type, quality,
259 title, fileout);
260 sarrayDestroy(&sa);
261 return ret;
262 }
263
264
265 /*!
266 * \brief saConvertFilesToPdf()
267 *
268 * \param[in] sa string array of pathnames for images
269 * \param[in] res input resolution of all images
270 * \param[in] scalefactor scaling factor applied to each image; > 0.0
271 * \param[in] type encoding type (L_JPEG_ENCODE, L_G4_ENCODE,
272 * L_FLATE_ENCODE, or 0 for default
273 * \param[in] quality used for JPEG only; 0 for default (75)
274 * \param[in] title [optional] pdf title; if null, taken from the first
275 * image filename
276 * \param[in] fileout pdf file of all images
277 * \return 0 if OK, 1 on error
278 *
279 * <pre>
280 * Notes:
281 * (1) See convertFilesToPdf().
282 * </pre>
283 */
284 l_int32
saConvertFilesToPdf(SARRAY * sa,l_int32 res,l_float32 scalefactor,l_int32 type,l_int32 quality,const char * title,const char * fileout)285 saConvertFilesToPdf(SARRAY *sa,
286 l_int32 res,
287 l_float32 scalefactor,
288 l_int32 type,
289 l_int32 quality,
290 const char *title,
291 const char *fileout)
292 {
293 l_uint8 *data;
294 l_int32 ret;
295 size_t nbytes;
296
297 PROCNAME("saConvertFilesToPdf");
298
299 if (!sa)
300 return ERROR_INT("sa not defined", procName, 1);
301
302 ret = saConvertFilesToPdfData(sa, res, scalefactor, type, quality,
303 title, &data, &nbytes);
304 if (ret) {
305 if (data) LEPT_FREE(data);
306 return ERROR_INT("pdf data not made", procName, 1);
307 }
308
309 ret = l_binaryWrite(fileout, "w", data, nbytes);
310 LEPT_FREE(data);
311 if (ret)
312 L_ERROR("pdf data not written to file\n", procName);
313 return ret;
314 }
315
316
317 /*!
318 * \brief saConvertFilesToPdfData()
319 *
320 * \param[in] sa string array of pathnames for images
321 * \param[in] res input resolution of all images
322 * \param[in] scalefactor scaling factor applied to each image; > 0.0
323 * \param[in] type encoding type (L_JPEG_ENCODE, L_G4_ENCODE,
324 * L_FLATE_ENCODE, or 0 for default
325 * \param[in] quality used for JPEG only; 0 for default (75)
326 * \param[in] title [optional] pdf title; if null, taken from the first
327 * image filename
328 * \param[out] pdata output pdf data (of all images
329 * \param[out] pnbytes size of output pdf data
330 * \return 0 if OK, 1 on error
331 *
332 * <pre>
333 * Notes:
334 * (1) See convertFilesToPdf().
335 * </pre>
336 */
337 l_int32
saConvertFilesToPdfData(SARRAY * sa,l_int32 res,l_float32 scalefactor,l_int32 type,l_int32 quality,const char * title,l_uint8 ** pdata,size_t * pnbytes)338 saConvertFilesToPdfData(SARRAY *sa,
339 l_int32 res,
340 l_float32 scalefactor,
341 l_int32 type,
342 l_int32 quality,
343 const char *title,
344 l_uint8 **pdata,
345 size_t *pnbytes)
346 {
347 char *fname;
348 const char *pdftitle;
349 l_uint8 *imdata;
350 l_int32 i, n, ret, pagetype, npages, scaledres;
351 size_t imbytes;
352 L_BYTEA *ba;
353 PIX *pixs, *pix;
354 L_PTRA *pa_data;
355
356 PROCNAME("saConvertFilesToPdfData");
357
358 if (!pdata)
359 return ERROR_INT("&data not defined", procName, 1);
360 *pdata = NULL;
361 if (!pnbytes)
362 return ERROR_INT("&nbytes not defined", procName, 1);
363 *pnbytes = 0;
364 if (!sa)
365 return ERROR_INT("sa not defined", procName, 1);
366 if (scalefactor <= 0.0) scalefactor = 1.0;
367 if (type < 0 || type > L_FLATE_ENCODE) {
368 L_WARNING("invalid compression type; using per-page default\n",
369 procName);
370 type = 0;
371 }
372
373 /* Generate all the encoded pdf strings */
374 n = sarrayGetCount(sa);
375 pa_data = ptraCreate(n);
376 pdftitle = NULL;
377 for (i = 0; i < n; i++) {
378 if (i && (i % 10 == 0)) fprintf(stderr, ".. %d ", i);
379 fname = sarrayGetString(sa, i, L_NOCOPY);
380 if ((pixs = pixRead(fname)) == NULL) {
381 L_ERROR("image not readable from file %s\n", procName, fname);
382 continue;
383 }
384 if (!pdftitle)
385 pdftitle = (title) ? title : fname;
386 if (scalefactor != 1.0)
387 pix = pixScale(pixs, scalefactor, scalefactor);
388 else
389 pix = pixClone(pixs);
390 pixDestroy(&pixs);
391 scaledres = (l_int32)(res * scalefactor);
392 if (type != 0) {
393 pagetype = type;
394 } else if (selectDefaultPdfEncoding(pix, &pagetype) != 0) {
395 pixDestroy(&pix);
396 L_ERROR("encoding type selection failed for file %s\n",
397 procName, fname);
398 continue;
399 }
400 ret = pixConvertToPdfData(pix, pagetype, quality, &imdata, &imbytes,
401 0, 0, scaledres, pdftitle, NULL, 0);
402 pixDestroy(&pix);
403 if (ret) {
404 LEPT_FREE(imdata);
405 L_ERROR("pdf encoding failed for %s\n", procName, fname);
406 continue;
407 }
408 ba = l_byteaInitFromMem(imdata, imbytes);
409 LEPT_FREE(imdata);
410 ptraAdd(pa_data, ba);
411 }
412 ptraGetActualCount(pa_data, &npages);
413 if (npages == 0) {
414 L_ERROR("no pdf files made\n", procName);
415 ptraDestroy(&pa_data, FALSE, FALSE);
416 return 1;
417 }
418
419 /* Concatenate them */
420 fprintf(stderr, "\nconcatenating ... ");
421 ret = ptraConcatenatePdfToData(pa_data, NULL, pdata, pnbytes);
422 fprintf(stderr, "done\n");
423
424 ptraGetActualCount(pa_data, &npages); /* recalculate in case it changes */
425 for (i = 0; i < npages; i++) {
426 ba = (L_BYTEA *)ptraRemove(pa_data, i, L_NO_COMPACTION);
427 l_byteaDestroy(&ba);
428 }
429 ptraDestroy(&pa_data, FALSE, FALSE);
430 return ret;
431 }
432
433
434 /*!
435 * \brief selectDefaultPdfEncoding()
436 *
437 * \param[in] pix
438 * \param[out] ptype L_G4_ENCODE, L_JPEG_ENCODE, L_FLATE_ENCODE
439 *
440 * <pre>
441 * Notes:
442 * (1) This attempts to choose an encoding for the pix that results
443 * in the smallest file, assuming that if jpeg encoded, it will
444 * use quality = 75. The decision is approximate, in that
445 * (a) all colormapped images will be losslessly encoded with
446 * gzip (flate), and (b) an image with less than about 20 colors
447 * is likely to be smaller if flate encoded than if encoded
448 * as a jpeg (dct). For example, an image made by pixScaleToGray3()
449 * will have 10 colors, and flate encoding will give about
450 * twice the compression as jpeg with quality = 75.
451 * </pre>
452 */
453 l_int32
selectDefaultPdfEncoding(PIX * pix,l_int32 * ptype)454 selectDefaultPdfEncoding(PIX *pix,
455 l_int32 *ptype)
456 {
457 l_int32 w, h, d, factor, ncolors;
458 PIXCMAP *cmap;
459
460 PROCNAME("selectDefaultPdfEncoding");
461
462 if (!pix)
463 return ERROR_INT("pix not defined", procName, 1);
464 if (!ptype)
465 return ERROR_INT("&type not defined", procName, 1);
466 *ptype = L_FLATE_ENCODE; /* default universal encoding */
467 pixGetDimensions(pix, &w, &h, &d);
468 cmap = pixGetColormap(pix);
469 if (d == 8 && !cmap) {
470 factor = L_MAX(1, (l_int32)sqrt((l_float64)(w * h) / 20000.));
471 pixNumColors(pix, factor, &ncolors);
472 if (ncolors < 20)
473 *ptype = L_FLATE_ENCODE;
474 else
475 *ptype = L_JPEG_ENCODE;
476 } else if (d == 1) {
477 *ptype = L_G4_ENCODE;
478 } else if (cmap || d == 2 || d == 4) {
479 *ptype = L_FLATE_ENCODE;
480 } else if (d == 8 || d == 32) {
481 *ptype = L_JPEG_ENCODE;
482 } else {
483 return ERROR_INT("type selection failure", procName, 1);
484 }
485
486 return 0;
487 }
488
489
490 /*---------------------------------------------------------------------*
491 * Convert specified image files to pdf without scaling *
492 *---------------------------------------------------------------------*/
493 /*!
494 * \brief convertUnscaledFilesToPdf()
495 *
496 * \param[in] dirname directory name containing images
497 * \param[in] substr [optional] substring filter on filenames; can be NULL
498 * \param[in] title [optional] pdf title; if null, taken from the first
499 * image filename
500 * \param[in] fileout pdf file of all images
501 * \return 0 if OK, 1 on error
502 *
503 * <pre>
504 * Notes:
505 * (1) If %substr is not NULL, only image filenames that contain
506 * the substring can be used. If %substr == NULL, all files
507 * in the directory are used.
508 * (2) The files in the directory, after optional filtering by
509 * the substring, are lexically sorted in increasing order
510 * before concatenation.
511 * (3) For jpeg and jp2k, this is very fast because the compressed
512 * data is wrapped up and concatenated. For png and tiffg4,
513 * the images must be read and recompressed.
514 * </pre>
515 */
516 l_int32
convertUnscaledFilesToPdf(const char * dirname,const char * substr,const char * title,const char * fileout)517 convertUnscaledFilesToPdf(const char *dirname,
518 const char *substr,
519 const char *title,
520 const char *fileout)
521 {
522 l_int32 ret;
523 SARRAY *sa;
524
525 PROCNAME("convertUnscaledFilesToPdf");
526
527 if (!dirname)
528 return ERROR_INT("dirname not defined", procName, 1);
529 if (!fileout)
530 return ERROR_INT("fileout not defined", procName, 1);
531
532 if ((sa = getSortedPathnamesInDirectory(dirname, substr, 0, 0)) == NULL)
533 return ERROR_INT("sa not made", procName, 1);
534 ret = saConvertUnscaledFilesToPdf(sa, title, fileout);
535 sarrayDestroy(&sa);
536 return ret;
537 }
538
539
540 /*!
541 * \brief saConvertUnscaledFilesToPdf()
542 *
543 * \param[in] sa string array of pathnames for images
544 * \param[in] title [optional] pdf title; if null, taken from the first
545 * image filename
546 * \param[in] fileout pdf file of all images
547 * \return 0 if OK, 1 on error
548 *
549 * <pre>
550 * Notes:
551 * (1) See convertUnscaledFilesToPdf().
552 * </pre>
553 */
554 l_int32
saConvertUnscaledFilesToPdf(SARRAY * sa,const char * title,const char * fileout)555 saConvertUnscaledFilesToPdf(SARRAY *sa,
556 const char *title,
557 const char *fileout)
558 {
559 l_uint8 *data;
560 l_int32 ret;
561 size_t nbytes;
562
563 PROCNAME("saConvertUnscaledFilesToPdf");
564
565 if (!sa)
566 return ERROR_INT("sa not defined", procName, 1);
567
568 ret = saConvertUnscaledFilesToPdfData(sa, title, &data, &nbytes);
569 if (ret) {
570 if (data) LEPT_FREE(data);
571 return ERROR_INT("pdf data not made", procName, 1);
572 }
573
574 ret = l_binaryWrite(fileout, "w", data, nbytes);
575 LEPT_FREE(data);
576 if (ret)
577 L_ERROR("pdf data not written to file\n", procName);
578 return ret;
579 }
580
581
582 /*!
583 * \brief saConvertUnscaledFilesToPdfData()
584 *
585 * \param[in] sa string array of pathnames for images
586 * \param[in] title [optional] pdf title; if null, taken from the first
587 * image filename
588 * \param[out] pdata output pdf data (of all images)
589 * \param[out] pnbytes size of output pdf data
590 * \return 0 if OK, 1 on error
591 */
592 l_int32
saConvertUnscaledFilesToPdfData(SARRAY * sa,const char * title,l_uint8 ** pdata,size_t * pnbytes)593 saConvertUnscaledFilesToPdfData(SARRAY *sa,
594 const char *title,
595 l_uint8 **pdata,
596 size_t *pnbytes)
597 {
598 char *fname;
599 l_uint8 *imdata;
600 l_int32 i, n, ret, npages;
601 size_t imbytes;
602 L_BYTEA *ba;
603 L_PTRA *pa_data;
604
605 PROCNAME("saConvertUnscaledFilesToPdfData");
606
607 if (!pdata)
608 return ERROR_INT("&data not defined", procName, 1);
609 *pdata = NULL;
610 if (!pnbytes)
611 return ERROR_INT("&nbytes not defined", procName, 1);
612 *pnbytes = 0;
613 if (!sa)
614 return ERROR_INT("sa not defined", procName, 1);
615
616 /* Generate all the encoded pdf strings */
617 n = sarrayGetCount(sa);
618 pa_data = ptraCreate(n);
619 for (i = 0; i < n; i++) {
620 if (i && (i % 10 == 0)) fprintf(stderr, ".. %d ", i);
621 fname = sarrayGetString(sa, i, L_NOCOPY);
622
623 /* Generate the pdf data */
624 if (convertUnscaledToPdfData(fname, title, &imdata, &imbytes))
625 continue;
626
627 /* ... and add it to the array of single page data */
628 ba = l_byteaInitFromMem(imdata, imbytes);
629 if (imdata) LEPT_FREE(imdata);
630 ptraAdd(pa_data, ba);
631 }
632 ptraGetActualCount(pa_data, &npages);
633 if (npages == 0) {
634 L_ERROR("no pdf files made\n", procName);
635 ptraDestroy(&pa_data, FALSE, FALSE);
636 return 1;
637 }
638
639 /* Concatenate to generate a multipage pdf */
640 fprintf(stderr, "\nconcatenating ... ");
641 ret = ptraConcatenatePdfToData(pa_data, NULL, pdata, pnbytes);
642 fprintf(stderr, "done\n");
643
644 /* Clean up */
645 ptraGetActualCount(pa_data, &npages); /* maybe failed to read some files */
646 for (i = 0; i < npages; i++) {
647 ba = (L_BYTEA *)ptraRemove(pa_data, i, L_NO_COMPACTION);
648 l_byteaDestroy(&ba);
649 }
650 ptraDestroy(&pa_data, FALSE, FALSE);
651 return ret;
652 }
653
654
655 /*!
656 * \brief convertUnscaledToPdfData()
657 *
658 * \param[in] fname of image file
659 * \param[in] title [optional] pdf title; can be NULL
660 * \param[out] pdata output pdf data for image
661 * \param[out] pnbytes size of output pdf data
662 * \return 0 if OK, 1 on error
663 */
664 l_int32
convertUnscaledToPdfData(const char * fname,const char * title,l_uint8 ** pdata,size_t * pnbytes)665 convertUnscaledToPdfData(const char *fname,
666 const char *title,
667 l_uint8 **pdata,
668 size_t *pnbytes)
669 {
670 const char *pdftitle = NULL;
671 char *tail = NULL;
672 l_int32 format;
673 L_COMP_DATA *cid;
674
675 PROCNAME("convertUnscaledToPdfData");
676
677 if (!pdata)
678 return ERROR_INT("&data not defined", procName, 1);
679 *pdata = NULL;
680 if (!pnbytes)
681 return ERROR_INT("&nbytes not defined", procName, 1);
682 *pnbytes = 0;
683 if (!fname)
684 return ERROR_INT("fname not defined", procName, 1);
685
686 findFileFormat(fname, &format);
687 if (format == IFF_UNKNOWN) {
688 L_WARNING("file %s format is unknown; skip\n", procName, fname);
689 return 1;
690 }
691 if (format == IFF_PS || format == IFF_LPDF) {
692 L_WARNING("file %s format is %d; skip\n", procName, fname, format);
693 return 1;
694 }
695
696 /* Generate the image data required for pdf generation, always
697 * in binary (not ascii85) coding; jpeg files are never transcoded. */
698 l_generateCIDataForPdf(fname, NULL, 0, &cid);
699 if (!cid) {
700 L_ERROR("file %s format is %d; unreadable\n", procName, fname, format);
701 return 1;
702 }
703
704 /* If %title == NULL, use the tail of %fname. */
705 if (title) {
706 pdftitle = title;
707 } else {
708 splitPathAtDirectory(fname, NULL, &tail);
709 pdftitle = tail;
710 }
711
712 /* Generate the pdf string for this page (image). This destroys
713 * the cid by attaching it to an lpd and destroying the lpd. */
714 cidConvertToPdfData(cid, pdftitle, pdata, pnbytes);
715 LEPT_FREE(tail);
716 return 0;
717 }
718
719
720 /*---------------------------------------------------------------------*
721 * Convert multiple images to pdf (one image per page) *
722 *---------------------------------------------------------------------*/
723 /*!
724 * \brief pixaConvertToPdf()
725 *
726 * \param[in] pixa containing images all at the same resolution
727 * \param[in] res override the resolution of each input image, in ppi;
728 * use 0 to respect the resolution embedded in the input
729 * \param[in] scalefactor scaling factor applied to each image; > 0.0
730 * \param[in] type encoding type (L_JPEG_ENCODE, L_G4_ENCODE,
731 * L_FLATE_ENCODE, or 0 for default
732 * \param[in] quality used for JPEG only; 0 for default (75)
733 * \param[in] title [optional] pdf title
734 * \param[in] fileout pdf file of all images
735 * \return 0 if OK, 1 on error
736 *
737 * <pre>
738 * Notes:
739 * (1) The images are encoded with G4 if 1 bpp; JPEG if 8 bpp without
740 * colormap and many colors, or 32 bpp; FLATE for anything else.
741 * (2) The scalefactor must be > 0.0; otherwise it is set to 1.0.
742 * (3) Specifying one of the three encoding types for %type forces
743 * all images to be compressed with that type. Use 0 to have
744 * the type determined for each image based on depth and whether
745 * or not it has a colormap.
746 * </pre>
747 */
748 l_int32
pixaConvertToPdf(PIXA * pixa,l_int32 res,l_float32 scalefactor,l_int32 type,l_int32 quality,const char * title,const char * fileout)749 pixaConvertToPdf(PIXA *pixa,
750 l_int32 res,
751 l_float32 scalefactor,
752 l_int32 type,
753 l_int32 quality,
754 const char *title,
755 const char *fileout)
756 {
757 l_uint8 *data;
758 l_int32 ret;
759 size_t nbytes;
760
761 PROCNAME("pixaConvertToPdf");
762
763 if (!pixa)
764 return ERROR_INT("pixa not defined", procName, 1);
765
766 ret = pixaConvertToPdfData(pixa, res, scalefactor, type, quality,
767 title, &data, &nbytes);
768 if (ret) {
769 LEPT_FREE(data);
770 return ERROR_INT("conversion to pdf failed", procName, 1);
771 }
772
773 ret = l_binaryWrite(fileout, "w", data, nbytes);
774 LEPT_FREE(data);
775 if (ret)
776 L_ERROR("pdf data not written to file\n", procName);
777 return ret;
778 }
779
780
781 /*!
782 * \brief pixaConvertToPdfData()
783 *
784 * \param[in] pixa containing images all at the same resolution
785 * \param[in] res input resolution of all images
786 * \param[in] scalefactor scaling factor applied to each image; > 0.0
787 * \param[in] type encoding type (L_JPEG_ENCODE, L_G4_ENCODE,
788 * L_FLATE_ENCODE, or 0 for default
789 * \param[in] quality used for JPEG only; 0 for default (75)
790 * \param[in] title [optional] pdf title
791 * \param[out] pdata output pdf data (of all images
792 * \param[out] pnbytes size of output pdf data
793 * \return 0 if OK, 1 on error
794 *
795 * <pre>
796 * Notes:
797 * (1) See pixaConvertToPdf().
798 * </pre>
799 */
800 l_int32
pixaConvertToPdfData(PIXA * pixa,l_int32 res,l_float32 scalefactor,l_int32 type,l_int32 quality,const char * title,l_uint8 ** pdata,size_t * pnbytes)801 pixaConvertToPdfData(PIXA *pixa,
802 l_int32 res,
803 l_float32 scalefactor,
804 l_int32 type,
805 l_int32 quality,
806 const char *title,
807 l_uint8 **pdata,
808 size_t *pnbytes)
809 {
810 l_uint8 *imdata;
811 l_int32 i, n, ret, scaledres, pagetype;
812 size_t imbytes;
813 L_BYTEA *ba;
814 PIX *pixs, *pix;
815 L_PTRA *pa_data;
816
817 PROCNAME("pixaConvertToPdfData");
818
819 if (!pdata)
820 return ERROR_INT("&data not defined", procName, 1);
821 *pdata = NULL;
822 if (!pnbytes)
823 return ERROR_INT("&nbytes not defined", procName, 1);
824 *pnbytes = 0;
825 if (!pixa)
826 return ERROR_INT("pixa not defined", procName, 1);
827 if (scalefactor <= 0.0) scalefactor = 1.0;
828 if (type < 0 || type > L_FLATE_ENCODE) {
829 L_WARNING("invalid compression type; using per-page default\n",
830 procName);
831 type = 0;
832 }
833
834 /* Generate all the encoded pdf strings */
835 n = pixaGetCount(pixa);
836 pa_data = ptraCreate(n);
837 for (i = 0; i < n; i++) {
838 if ((pixs = pixaGetPix(pixa, i, L_CLONE)) == NULL) {
839 L_ERROR("pix[%d] not retrieved\n", procName, i);
840 continue;
841 }
842 if (scalefactor != 1.0)
843 pix = pixScale(pixs, scalefactor, scalefactor);
844 else
845 pix = pixClone(pixs);
846 pixDestroy(&pixs);
847 scaledres = (l_int32)(res * scalefactor);
848 if (type != 0) {
849 pagetype = type;
850 } else if (selectDefaultPdfEncoding(pix, &pagetype) != 0) {
851 L_ERROR("encoding type selection failed for pix[%d]\n",
852 procName, i);
853 pixDestroy(&pix);
854 continue;
855 }
856 ret = pixConvertToPdfData(pix, pagetype, quality, &imdata, &imbytes,
857 0, 0, scaledres, title, NULL, 0);
858 pixDestroy(&pix);
859 if (ret) {
860 LEPT_FREE(imdata);
861 L_ERROR("pdf encoding failed for pix[%d]\n", procName, i);
862 continue;
863 }
864 ba = l_byteaInitFromMem(imdata, imbytes);
865 LEPT_FREE(imdata);
866 ptraAdd(pa_data, ba);
867 }
868 ptraGetActualCount(pa_data, &n);
869 if (n == 0) {
870 L_ERROR("no pdf files made\n", procName);
871 ptraDestroy(&pa_data, FALSE, FALSE);
872 return 1;
873 }
874
875 /* Concatenate them */
876 ret = ptraConcatenatePdfToData(pa_data, NULL, pdata, pnbytes);
877
878 ptraGetActualCount(pa_data, &n); /* recalculate in case it changes */
879 for (i = 0; i < n; i++) {
880 ba = (L_BYTEA *)ptraRemove(pa_data, i, L_NO_COMPACTION);
881 l_byteaDestroy(&ba);
882 }
883 ptraDestroy(&pa_data, FALSE, FALSE);
884 return ret;
885 }
886
887
888 /*---------------------------------------------------------------------*
889 * Single page, multi-image converters *
890 *---------------------------------------------------------------------*/
891 /*!
892 * \brief convertToPdf()
893 *
894 * \param[in] filein input image file -- any format
895 * \param[in] type L_G4_ENCODE, L_JPEG_ENCODE, L_FLATE_ENCODE
896 * \param[in] quality used for JPEG only; 0 for default (75)
897 * \param[in] fileout output pdf file; only required on last image on page
898 * \param[in] x, y location of lower-left corner of image, in pixels,
899 * relative to the PostScript origin (0,0) at
900 * the lower-left corner of the page
901 * \param[in] res override the resolution of the input image, in ppi;
902 * use 0 to respect the resolution embedded in the input
903 * \param[in] title [optional] pdf title; if null, taken from filein
904 * \param[in,out] plpd ptr to lpd, which is created on the first invocation
905 * and returned until last image is processed, at which
906 * time it is destroyed
907 * \param[in] position in image sequence: L_FIRST_IMAGE, L_NEXT_IMAGE,
908 * L_LAST_IMAGE
909 * \return 0 if OK, 1 on error
910 *
911 * <pre>
912 * Notes:
913 * (1) To wrap only one image in pdf, input %plpd = NULL, and
914 * the value of %position will be ignored:
915 * convertToPdf(... type, quality, x, y, res, NULL, 0);
916 * (2) To wrap multiple images on a single pdf page, this is called
917 * once for each successive image. Do it this way:
918 * L_PDF_DATA *lpd;
919 * convertToPdf(... type, quality, x, y, res, &lpd, L_FIRST_IMAGE);
920 * convertToPdf(... type, quality, x, y, res, &lpd, L_NEXT_IMAGE);
921 * ...
922 * convertToPdf(... type, quality, x, y, res, &lpd, L_LAST_IMAGE);
923 * This will write the result to the value of %fileout specified
924 * in the first call; succeeding values of %fileout are ignored.
925 * On the last call: the pdf data bytes are computed and written
926 * to %fileout, lpd is destroyed internally, and the returned
927 * value of lpd is null. So the client has nothing to clean up.
928 * (3) (a) Set %res == 0 to respect the resolution embedded in the
929 * image file. If no resolution is embedded, it will be set
930 * to the default value.
931 * (b) Set %res to some other value to override the file resolution.
932 * (4) (a) If the input %res and the resolution of the output device
933 * are equal, the image will be "displayed" at the same size
934 * as the original.
935 * (b) If the input %res is 72, the output device will render
936 * the image at 1 pt/pixel.
937 * (c) Some possible choices for the default input pix resolution are:
938 * 72 ppi Render pix on any output device at one pt/pixel
939 * 96 ppi Windows default for generated display images
940 * 300 ppi Typical default for scanned images.
941 * We choose 300, which is sensible for rendering page images.
942 * However, images come from a variety of sources, and
943 * some are explicitly created for viewing on a display.
944 * </pre>
945 */
946 l_int32
convertToPdf(const char * filein,l_int32 type,l_int32 quality,const char * fileout,l_int32 x,l_int32 y,l_int32 res,const char * title,L_PDF_DATA ** plpd,l_int32 position)947 convertToPdf(const char *filein,
948 l_int32 type,
949 l_int32 quality,
950 const char *fileout,
951 l_int32 x,
952 l_int32 y,
953 l_int32 res,
954 const char *title,
955 L_PDF_DATA **plpd,
956 l_int32 position)
957 {
958 l_uint8 *data;
959 l_int32 ret;
960 size_t nbytes;
961
962 PROCNAME("convertToPdf");
963
964 if (!filein)
965 return ERROR_INT("filein not defined", procName, 1);
966 if (!plpd || (position == L_LAST_IMAGE)) {
967 if (!fileout)
968 return ERROR_INT("fileout not defined", procName, 1);
969 }
970 if (type != L_G4_ENCODE && type != L_JPEG_ENCODE &&
971 type != L_FLATE_ENCODE)
972 return ERROR_INT("invalid conversion type", procName, 1);
973
974 if (convertToPdfData(filein, type, quality, &data, &nbytes, x, y,
975 res, title, plpd, position))
976 return ERROR_INT("pdf data not made", procName, 1);
977
978 if (!plpd || (position == L_LAST_IMAGE)) {
979 ret = l_binaryWrite(fileout, "w", data, nbytes);
980 LEPT_FREE(data);
981 if (ret)
982 return ERROR_INT("pdf data not written to file", procName, 1);
983 }
984
985 return 0;
986 }
987
988
989 /*!
990 * \brief convertImageDataToPdf()
991 *
992 * \param[in] imdata array of formatted image data; e.g., png, jpeg
993 * \param[in] size size of image data
994 * \param[in] type L_G4_ENCODE, L_JPEG_ENCODE, L_FLATE_ENCODE
995 * \param[in] quality used for JPEG only; 0 for default (75)
996 * \param[in] fileout output pdf file; only required on last image on page
997 * \param[in] x, y location of lower-left corner of image, in pixels,
998 * relative to the PostScript origin (0,0) at
999 * the lower-left corner of the page
1000 * \param[in] res override the resolution of the input image, in ppi;
1001 * use 0 to respect the resolution embedded in the input
1002 * \param[in] title [optional] pdf title
1003 * \param[in,out] plpd ptr to lpd, which is created on the first invocation
1004 * and returned until last image is processed, at which
1005 * time it is destroyed
1006 * \param[in] position in image sequence: L_FIRST_IMAGE, L_NEXT_IMAGE,
1007 * L_LAST_IMAGE
1008 * \return 0 if OK, 1 on error
1009 *
1010 * <pre>
1011 * Notes:
1012 * (1) If %res == 0 and the input resolution field is 0,
1013 * this will use DEFAULT_INPUT_RES.
1014 * (2) See comments in convertToPdf().
1015 * </pre>
1016 */
1017 l_int32
convertImageDataToPdf(l_uint8 * imdata,size_t size,l_int32 type,l_int32 quality,const char * fileout,l_int32 x,l_int32 y,l_int32 res,const char * title,L_PDF_DATA ** plpd,l_int32 position)1018 convertImageDataToPdf(l_uint8 *imdata,
1019 size_t size,
1020 l_int32 type,
1021 l_int32 quality,
1022 const char *fileout,
1023 l_int32 x,
1024 l_int32 y,
1025 l_int32 res,
1026 const char *title,
1027 L_PDF_DATA **plpd,
1028 l_int32 position)
1029 {
1030 l_int32 ret;
1031 PIX *pix;
1032
1033 PROCNAME("convertImageDataToPdf");
1034
1035 if (!imdata)
1036 return ERROR_INT("image data not defined", procName, 1);
1037 if (type != L_G4_ENCODE && type != L_JPEG_ENCODE &&
1038 type != L_FLATE_ENCODE)
1039 return ERROR_INT("invalid conversion type", procName, 1);
1040 if (!plpd || (position == L_LAST_IMAGE)) {
1041 if (!fileout)
1042 return ERROR_INT("fileout not defined", procName, 1);
1043 }
1044
1045 if ((pix = pixReadMem(imdata, size)) == NULL)
1046 return ERROR_INT("pix not read", procName, 1);
1047 ret = pixConvertToPdf(pix, type, quality, fileout, x, y, res,
1048 title, plpd, position);
1049 pixDestroy(&pix);
1050 return ret;
1051 }
1052
1053
1054 /*!
1055 * \brief convertToPdfData()
1056 *
1057 * \param[in] filein input image file -- any format
1058 * \param[in] type L_G4_ENCODE, L_JPEG_ENCODE, L_FLATE_ENCODE
1059 * \param[in] quality used for JPEG only; 0 for default (75)
1060 * \param[out] pdata pdf data in memory
1061 * \param[out] pnbytes number of bytes in pdf data
1062 * \param[in] x, y location of lower-left corner of image, in pixels,
1063 * relative to the PostScript origin (0,0) at
1064 * the lower-left corner of the page
1065 * \param[in] res override the resolution of the input image, in ppi;
1066 * use 0 to respect the resolution embedded in the input
1067 * \param[in] title [optional] pdf title; if null, use filein
1068 * \param[in,out] plpd ptr to lpd, which is created on the first invocation
1069 * and returned until last image is processed, at which
1070 * time it is destroyed
1071 * \param[in] position in image sequence: L_FIRST_IMAGE, L_NEXT_IMAGE,
1072 * L_LAST_IMAGE
1073 * \return 0 if OK, 1 on error
1074 *
1075 * <pre>
1076 * Notes:
1077 * (1) If %res == 0 and the input resolution field is 0,
1078 * this will use DEFAULT_INPUT_RES.
1079 * (2) See comments in convertToPdf().
1080 * </pre>
1081 */
1082 l_int32
convertToPdfData(const char * filein,l_int32 type,l_int32 quality,l_uint8 ** pdata,size_t * pnbytes,l_int32 x,l_int32 y,l_int32 res,const char * title,L_PDF_DATA ** plpd,l_int32 position)1083 convertToPdfData(const char *filein,
1084 l_int32 type,
1085 l_int32 quality,
1086 l_uint8 **pdata,
1087 size_t *pnbytes,
1088 l_int32 x,
1089 l_int32 y,
1090 l_int32 res,
1091 const char *title,
1092 L_PDF_DATA **plpd,
1093 l_int32 position)
1094 {
1095 PIX *pix;
1096
1097 PROCNAME("convertToPdfData");
1098
1099 if (!pdata)
1100 return ERROR_INT("&data not defined", procName, 1);
1101 *pdata = NULL;
1102 if (!pnbytes)
1103 return ERROR_INT("&nbytes not defined", procName, 1);
1104 *pnbytes = 0;
1105 if (!filein)
1106 return ERROR_INT("filein not defined", procName, 1);
1107 if (type != L_G4_ENCODE && type != L_JPEG_ENCODE &&
1108 type != L_FLATE_ENCODE)
1109 return ERROR_INT("invalid conversion type", procName, 1);
1110
1111 if ((pix = pixRead(filein)) == NULL)
1112 return ERROR_INT("pix not made", procName, 1);
1113
1114 pixConvertToPdfData(pix, type, quality, pdata, pnbytes,
1115 x, y, res, (title) ? title : filein, plpd, position);
1116 pixDestroy(&pix);
1117 return 0;
1118 }
1119
1120
1121 /*!
1122 * \brief convertImageDataToPdfData()
1123 *
1124 * \param[in] imdata array of formatted image data; e.g., png, jpeg
1125 * \param[in] size size of image data
1126 * \param[in] type L_G4_ENCODE, L_JPEG_ENCODE, L_FLATE_ENCODE
1127 * \param[in] quality used for JPEG only; 0 for default (75)
1128 * \param[out] pdata pdf data in memory
1129 * \param[out] pnbytes number of bytes in pdf data
1130 * \param[in] x, y location of lower-left corner of image, in pixels,
1131 * relative to the PostScript origin (0,0) at
1132 * the lower-left corner of the page
1133 * \param[in] res override the resolution of the input image, in ppi;
1134 * use 0 to respect the resolution embedded in the input
1135 * \param[in] title [optional] pdf title
1136 * \param[out] plpd ptr to lpd, which is created on the first invocation
1137 * and returned until last image is processed, at which
1138 * time it is destroyed
1139 * \param[in] position in image sequence: L_FIRST_IMAGE, L_NEXT_IMAGE,
1140 * L_LAST_IMAGE
1141 * \return 0 if OK, 1 on error
1142 *
1143 * <pre>
1144 * Notes:
1145 * (1) If %res == 0 and the input resolution field is 0,
1146 * this will use DEFAULT_INPUT_RES.
1147 * (2) See comments in convertToPdf().
1148 * </pre>
1149 */
1150 l_int32
convertImageDataToPdfData(l_uint8 * imdata,size_t size,l_int32 type,l_int32 quality,l_uint8 ** pdata,size_t * pnbytes,l_int32 x,l_int32 y,l_int32 res,const char * title,L_PDF_DATA ** plpd,l_int32 position)1151 convertImageDataToPdfData(l_uint8 *imdata,
1152 size_t size,
1153 l_int32 type,
1154 l_int32 quality,
1155 l_uint8 **pdata,
1156 size_t *pnbytes,
1157 l_int32 x,
1158 l_int32 y,
1159 l_int32 res,
1160 const char *title,
1161 L_PDF_DATA **plpd,
1162 l_int32 position)
1163 {
1164 l_int32 ret;
1165 PIX *pix;
1166
1167 PROCNAME("convertImageDataToPdfData");
1168
1169 if (!pdata)
1170 return ERROR_INT("&data not defined", procName, 1);
1171 *pdata = NULL;
1172 if (!pnbytes)
1173 return ERROR_INT("&nbytes not defined", procName, 1);
1174 *pnbytes = 0;
1175 if (!imdata)
1176 return ERROR_INT("image data not defined", procName, 1);
1177 if (plpd) { /* part of multi-page invocation */
1178 if (position == L_FIRST_IMAGE)
1179 *plpd = NULL;
1180 }
1181
1182 if ((pix = pixReadMem(imdata, size)) == NULL)
1183 return ERROR_INT("pix not read", procName, 1);
1184 ret = pixConvertToPdfData(pix, type, quality, pdata, pnbytes,
1185 x, y, res, title, plpd, position);
1186 pixDestroy(&pix);
1187 return ret;
1188 }
1189
1190
1191 /*!
1192 * \brief pixConvertToPdf()
1193 *
1194 * \param[in] pix
1195 * \param[in] type L_G4_ENCODE, L_JPEG_ENCODE, L_FLATE_ENCODE
1196 * \param[in] quality used for JPEG only; 0 for default (75)
1197 * \param[in] fileout output pdf file; only required on last image on page
1198 * \param[in] x, y location of lower-left corner of image, in pixels,
1199 * relative to the PostScript origin (0,0 at
1200 * the lower-left corner of the page)
1201 * \param[in] res override the resolution of the input image, in ppi;
1202 * use 0 to respect the resolution embedded in the input
1203 * \param[in] title [optional] pdf title
1204 * \param[in,out] plpd ptr to lpd, which is created on the first invocation
1205 * and returned until last image is processed
1206 * \param[in] position in image sequence: L_FIRST_IMAGE, L_NEXT_IMAGE,
1207 * L_LAST_IMAGE
1208 * \return 0 if OK, 1 on error
1209 *
1210 * <pre>
1211 * Notes:
1212 * (1) If %res == 0 and the input resolution field is 0,
1213 * this will use DEFAULT_INPUT_RES.
1214 * (2) This only writes data to fileout if it is the last
1215 * image to be written on the page.
1216 * (3) See comments in convertToPdf().
1217 * </pre>
1218 */
1219 l_int32
pixConvertToPdf(PIX * pix,l_int32 type,l_int32 quality,const char * fileout,l_int32 x,l_int32 y,l_int32 res,const char * title,L_PDF_DATA ** plpd,l_int32 position)1220 pixConvertToPdf(PIX *pix,
1221 l_int32 type,
1222 l_int32 quality,
1223 const char *fileout,
1224 l_int32 x,
1225 l_int32 y,
1226 l_int32 res,
1227 const char *title,
1228 L_PDF_DATA **plpd,
1229 l_int32 position)
1230 {
1231 l_uint8 *data;
1232 l_int32 ret;
1233 size_t nbytes;
1234
1235 PROCNAME("pixConvertToPdf");
1236
1237 if (!pix)
1238 return ERROR_INT("pix not defined", procName, 1);
1239 if (type != L_G4_ENCODE && type != L_JPEG_ENCODE &&
1240 type != L_FLATE_ENCODE)
1241 return ERROR_INT("invalid conversion type", procName, 1);
1242 if (!plpd || (position == L_LAST_IMAGE)) {
1243 if (!fileout)
1244 return ERROR_INT("fileout not defined", procName, 1);
1245 }
1246
1247 if (pixConvertToPdfData(pix, type, quality, &data, &nbytes,
1248 x, y, res, title, plpd, position)) {
1249 LEPT_FREE(data);
1250 return ERROR_INT("pdf data not made", procName, 1);
1251 }
1252
1253 if (!plpd || (position == L_LAST_IMAGE)) {
1254 ret = l_binaryWrite(fileout, "w", data, nbytes);
1255 LEPT_FREE(data);
1256 if (ret)
1257 return ERROR_INT("pdf data not written to file", procName, 1);
1258 }
1259 return 0;
1260 }
1261
1262
1263 /*!
1264 * \brief pixWriteStreamPdf()
1265 *
1266 * \param[in] fp file stream opened for writing
1267 * \param[in] pix all depths, cmap OK
1268 * \param[in] res override the resolution of the input image, in ppi;
1269 * use 0 to respect the resolution embedded in the input
1270 * \param[in] title [optional] pdf title; taken from the first image
1271 * placed on a page; e.g., an input image filename
1272 * \return 0 if OK, 1 on error
1273 *
1274 * <pre>
1275 * Notes:
1276 * (1) This is the simplest interface for writing a single image
1277 * with pdf encoding to a stream. It uses G4 encoding for 1 bpp,
1278 * JPEG encoding for 8 bpp (no cmap) and 32 bpp, and FLATE
1279 * encoding for everything else.
1280 * </pre>
1281 */
1282 l_int32
pixWriteStreamPdf(FILE * fp,PIX * pix,l_int32 res,const char * title)1283 pixWriteStreamPdf(FILE *fp,
1284 PIX *pix,
1285 l_int32 res,
1286 const char *title)
1287 {
1288 l_uint8 *data;
1289 size_t nbytes, nbytes_written;
1290
1291 PROCNAME("pixWriteStreamPdf");
1292
1293 if (!fp)
1294 return ERROR_INT("stream not opened", procName, 1);
1295 if (!pix)
1296 return ERROR_INT("pix not defined", procName, 1);
1297
1298 if (pixWriteMemPdf(&data, &nbytes, pix, res, title) != 0) {
1299 LEPT_FREE(data);
1300 return ERROR_INT("pdf data not made", procName, 1);
1301 }
1302
1303 nbytes_written = fwrite(data, 1, nbytes, fp);
1304 LEPT_FREE(data);
1305 if (nbytes != nbytes_written)
1306 return ERROR_INT("failure writing pdf data to stream", procName, 1);
1307 return 0;
1308 }
1309
1310
1311 /*!
1312 * \brief pixWriteMemPdf()
1313 *
1314 * \param[out] pdata pdf as byte array
1315 * \param[out] pnbytes number of bytes in pdf array
1316 * \param[in] pix all depths, cmap OK
1317 * \param[in] res override the resolution of the input image, in ppi;
1318 * use 0 to respect the resolution embedded in the input
1319 * \param[in] title [optional] pdf title; taken from the first image
1320 * placed on a page; e.g., an input image filename
1321 * \return 0 if OK, 1 on error
1322 *
1323 * <pre>
1324 * Notes:
1325 * (1) This is the simplest interface for writing a single image
1326 * with pdf encoding to memory. It uses G4 encoding for 1 bpp,
1327 * JPEG encoding for 8 bpp (no cmap) and 32 bpp, and FLATE
1328 * encoding for everything else.
1329 * </pre>
1330 */
1331 l_int32
pixWriteMemPdf(l_uint8 ** pdata,size_t * pnbytes,PIX * pix,l_int32 res,const char * title)1332 pixWriteMemPdf(l_uint8 **pdata,
1333 size_t *pnbytes,
1334 PIX *pix,
1335 l_int32 res,
1336 const char *title)
1337 {
1338 l_int32 ret, d, type;
1339 PIXCMAP *cmap;
1340
1341 PROCNAME("pixWriteMemPdf");
1342
1343 if (pdata) *pdata = NULL;
1344 if (pnbytes) *pnbytes = 0;
1345 if (!pdata || !pnbytes)
1346 return ERROR_INT("&data or &nbytes not defined", procName, 1);
1347 if (!pix)
1348 return ERROR_INT("pix not defined", procName, 1);
1349
1350 d = pixGetDepth(pix);
1351 cmap = pixGetColormap(pix);
1352 if (d == 1)
1353 type = L_G4_ENCODE;
1354 else if (cmap || d == 2 || d == 4 || d == 16)
1355 type = L_FLATE_ENCODE;
1356 else /* d == 8 (no cmap) or d == 32 */
1357 type = L_JPEG_ENCODE;
1358
1359 ret = pixConvertToPdfData(pix, type, 75, pdata, pnbytes,
1360 0, 0, res, title, NULL, 0);
1361 if (ret)
1362 return ERROR_INT("pdf data not made", procName, 1);
1363 return 0;
1364 }
1365
1366
1367 /*---------------------------------------------------------------------*
1368 * Segmented multi-page, multi-image converter *
1369 *---------------------------------------------------------------------*/
1370 /*!
1371 * \brief convertSegmentedFilesToPdf()
1372 *
1373 * \param[in] dirname directory name containing images
1374 * \param[in] substr [optional] substring filter on filenames; can be NULL
1375 * \param[in] res input resolution of all images
1376 * \param[in] type compression type for non-image regions; the
1377 * image regions are always compressed with L_JPEG_ENCODE
1378 * \param[in] thresh used for converting gray --> 1 bpp with L_G4_ENCODE
1379 * \param[in] baa [optional] boxaa of image regions
1380 * \param[in] quality used for JPEG only; 0 for default (75)
1381 * \param[in] scalefactor scaling factor applied to each image region
1382 * \param[in] title [optional] pdf title; if null, taken from the first
1383 * image filename
1384 * \param[in] fileout pdf file of all images
1385 * \return 0 if OK, 1 on error
1386 *
1387 * <pre>
1388 * Notes:
1389 * (1) If %substr is not NULL, only image filenames that contain
1390 * the substring can be used. If %substr == NULL, all files
1391 * in the directory are used.
1392 * (2) The files in the directory, after optional filtering by
1393 * the substring, are lexically sorted in increasing order
1394 * before concatenation.
1395 * (3) The images are encoded with G4 if 1 bpp; JPEG if 8 bpp without
1396 * colormap and many colors, or 32 bpp; FLATE for anything else.
1397 * (4) The boxaa, if it exists, contains one boxa of "image regions"
1398 * for each image file. The boxa must be aligned with the
1399 * sorted set of images.
1400 * (5) The scalefactor is applied to each image region. It is
1401 * typically < 1.0, to save bytes in the final pdf, because
1402 * the resolution is often not critical in non-text regions.
1403 * (6) If the non-image regions have pixel depth > 1 and the encoding
1404 * type is G4, they are automatically scaled up by 2x and
1405 * thresholded. Otherwise, no scaling is performed on them.
1406 * (7) Note that this function can be used to generate multipage
1407 * G4 compressed pdf from any input, by using %boxaa == NULL
1408 * and %type == L_G4_ENCODE.
1409 * </pre>
1410 */
1411 l_int32
convertSegmentedFilesToPdf(const char * dirname,const char * substr,l_int32 res,l_int32 type,l_int32 thresh,BOXAA * baa,l_int32 quality,l_float32 scalefactor,const char * title,const char * fileout)1412 convertSegmentedFilesToPdf(const char *dirname,
1413 const char *substr,
1414 l_int32 res,
1415 l_int32 type,
1416 l_int32 thresh,
1417 BOXAA *baa,
1418 l_int32 quality,
1419 l_float32 scalefactor,
1420 const char *title,
1421 const char *fileout)
1422 {
1423 char *fname;
1424 l_uint8 *imdata, *data;
1425 l_int32 i, npages, nboxa, nboxes, ret;
1426 size_t imbytes, databytes;
1427 BOXA *boxa;
1428 L_BYTEA *ba;
1429 L_PTRA *pa_data;
1430 SARRAY *sa;
1431
1432 PROCNAME("convertSegmentedFilesToPdf");
1433
1434 if (!dirname)
1435 return ERROR_INT("dirname not defined", procName, 1);
1436 if (!fileout)
1437 return ERROR_INT("fileout not defined", procName, 1);
1438
1439 if ((sa = getNumberedPathnamesInDirectory(dirname, substr, 0, 0, 10000))
1440 == NULL)
1441 return ERROR_INT("sa not made", procName, 1);
1442
1443 npages = sarrayGetCount(sa);
1444 /* If necessary, extend the boxaa, which is page-aligned with
1445 * the image files, to be as large as the set of images. */
1446 if (baa) {
1447 nboxa = boxaaGetCount(baa);
1448 if (nboxa < npages) {
1449 boxa = boxaCreate(1);
1450 boxaaExtendWithInit(baa, npages, boxa);
1451 boxaDestroy(&boxa);
1452 }
1453 }
1454
1455 /* Generate and save all the encoded pdf strings */
1456 pa_data = ptraCreate(npages);
1457 for (i = 0; i < npages; i++) {
1458 fname = sarrayGetString(sa, i, L_NOCOPY);
1459 if (!strcmp(fname, "")) continue;
1460 boxa = NULL;
1461 if (baa) {
1462 boxa = boxaaGetBoxa(baa, i, L_CLONE);
1463 nboxes = boxaGetCount(boxa);
1464 if (nboxes == 0)
1465 boxaDestroy(&boxa);
1466 }
1467 ret = convertToPdfDataSegmented(fname, res, type, thresh, boxa,
1468 quality, scalefactor, title,
1469 &imdata, &imbytes);
1470 boxaDestroy(&boxa); /* safe; in case nboxes > 0 */
1471 if (ret) {
1472 L_ERROR("pdf encoding failed for %s\n", procName, fname);
1473 continue;
1474 }
1475 ba = l_byteaInitFromMem(imdata, imbytes);
1476 if (imdata) LEPT_FREE(imdata);
1477 ptraAdd(pa_data, ba);
1478 }
1479 sarrayDestroy(&sa);
1480
1481 ptraGetActualCount(pa_data, &npages);
1482 if (npages == 0) {
1483 L_ERROR("no pdf files made\n", procName);
1484 ptraDestroy(&pa_data, FALSE, FALSE);
1485 return 1;
1486 }
1487
1488 /* Concatenate */
1489 ret = ptraConcatenatePdfToData(pa_data, NULL, &data, &databytes);
1490
1491 /* Clean up */
1492 ptraGetActualCount(pa_data, &npages); /* recalculate in case it changes */
1493 for (i = 0; i < npages; i++) {
1494 ba = (L_BYTEA *)ptraRemove(pa_data, i, L_NO_COMPACTION);
1495 l_byteaDestroy(&ba);
1496 }
1497 ptraDestroy(&pa_data, FALSE, FALSE);
1498
1499 if (ret) {
1500 if (data) LEPT_FREE(data);
1501 return ERROR_INT("pdf data not made", procName, 1);
1502 }
1503
1504 ret = l_binaryWrite(fileout, "w", data, databytes);
1505 LEPT_FREE(data);
1506 if (ret)
1507 L_ERROR("pdf data not written to file\n", procName);
1508 return ret;
1509 }
1510
1511
1512 /*!
1513 * \brief convertNumberedMasksToBoxaa()
1514 *
1515 * \param[in] dirname directory name containing mask images
1516 * \param[in] substr [optional] substring filter on filenames; can be NULL
1517 * \param[in] numpre number of characters in name before number
1518 * \param[in] numpost number of characters in name after number, up
1519 * to a dot before an extension
1520 * including an extension and the dot separator
1521 * \return boxaa of mask regions, or NULL on error
1522 *
1523 * <pre>
1524 * Notes:
1525 * (1) This is conveniently used to generate the input boxaa
1526 * for convertSegmentedFilesToPdf(). It guarantees that the
1527 * boxa will be aligned with the page images, even if some
1528 * of the boxa are empty.
1529 * </pre>
1530 */
1531 BOXAA *
convertNumberedMasksToBoxaa(const char * dirname,const char * substr,l_int32 numpre,l_int32 numpost)1532 convertNumberedMasksToBoxaa(const char *dirname,
1533 const char *substr,
1534 l_int32 numpre,
1535 l_int32 numpost)
1536 {
1537 char *fname;
1538 l_int32 i, n;
1539 BOXA *boxa;
1540 BOXAA *baa;
1541 PIX *pix;
1542 SARRAY *sa;
1543
1544 PROCNAME("convertNumberedMasksToBoxaa");
1545
1546 if (!dirname)
1547 return (BOXAA *)ERROR_PTR("dirname not defined", procName, NULL);
1548
1549 if ((sa = getNumberedPathnamesInDirectory(dirname, substr, numpre,
1550 numpost, 10000)) == NULL)
1551 return (BOXAA *)ERROR_PTR("sa not made", procName, NULL);
1552
1553 /* Generate and save all the encoded pdf strings */
1554 n = sarrayGetCount(sa);
1555 baa = boxaaCreate(n);
1556 boxa = boxaCreate(1);
1557 boxaaInitFull(baa, boxa);
1558 boxaDestroy(&boxa);
1559 for (i = 0; i < n; i++) {
1560 fname = sarrayGetString(sa, i, L_NOCOPY);
1561 if (!strcmp(fname, "")) continue;
1562 if ((pix = pixRead(fname)) == NULL) {
1563 L_WARNING("invalid image on page %d\n", procName, i);
1564 continue;
1565 }
1566 boxa = pixConnComp(pix, NULL, 8);
1567 boxaaReplaceBoxa(baa, i, boxa);
1568 pixDestroy(&pix);
1569 }
1570
1571 sarrayDestroy(&sa);
1572 return baa;
1573 }
1574
1575
1576 /*---------------------------------------------------------------------*
1577 * Segmented single page, multi-image converters *
1578 *---------------------------------------------------------------------*/
1579 /*!
1580 * \brief convertToPdfSegmented()
1581 *
1582 * \param[in] filein input image file -- any format
1583 * \param[in] res input image resolution; typ. 300 ppi; use 0 for default
1584 * \param[in] type compression type for non-image regions; the
1585 * image regions are always compressed with L_JPEG_ENCODE
1586 * \param[in] thresh used for converting gray --> 1 bpp with L_G4_ENCODE
1587 * \param[in] boxa [optional] of image regions; can be null
1588 * \param[in] quality used for jpeg image regions; 0 for default
1589 * \param[in] scalefactor used for jpeg regions; must be <= 1.0
1590 * \param[in] title [optional] pdf title; typically taken from the
1591 * input file for the pix
1592 * \param[in] fileout output pdf file
1593 * \return 0 if OK, 1 on error
1594 *
1595 * <pre>
1596 * Notes:
1597 * (1) If there are no image regions, set %boxa == NULL;
1598 * %quality and %scalefactor are ignored.
1599 * (2) Typically, %scalefactor is < 1.0, because the image regions
1600 * can be rendered at a lower resolution (for better compression)
1601 * than the text regions. If %scalefactor == 0, we use 1.0.
1602 * If the input image is 1 bpp and scalefactor < 1.0, we
1603 * use scaleToGray() to downsample the image regions to gray
1604 * before compressing them.
1605 * (3) If the compression type for non-image regions is L_G4_ENCODE
1606 * and bpp > 1, the image is upscaled 2x and thresholded
1607 * to 1 bpp. That is the only situation where %thresh is used.
1608 * (4) The parameter %quality is only used for image regions.
1609 * If %type == L_JPEG_ENCODE, default jpeg quality (75) is
1610 * used for the non-image regions.
1611 * (5) Processing matrix for non-image regions.
1612 *
1613 * Input G4 JPEG FLATE
1614 * ----------|---------------------------------------------------
1615 * 1 bpp | 1x, 1 bpp 1x flate, 1 bpp 1x, 1 bpp
1616 * |
1617 * cmap | 2x, 1 bpp 1x flate, cmap 1x, cmap
1618 * |
1619 * 2,4 bpp | 2x, 1 bpp 1x flate 1x, 2,4 bpp
1620 * no cmap | 2,4 bpp
1621 * |
1622 * 8,32 bpp | 2x, 1 bpp 1x (jpeg) 1x, 8,32 bpp
1623 * no cmap | 8,32 bpp
1624 *
1625 * Summary:
1626 * (a) if G4 is requested, G4 is used, with 2x upscaling
1627 * for all cases except 1 bpp.
1628 * (b) if JPEG is requested, use flate encoding for all cases
1629 * except 8 bpp without cmap and 32 bpp (rgb).
1630 * (c) if FLATE is requested, use flate with no transformation
1631 * of the raster data.
1632 * (6) Calling options/sequence for these functions:
1633 * file --> file (convertToPdfSegmented)
1634 * pix --> file (pixConvertToPdfSegmented)
1635 * pix --> data (pixConvertToPdfDataSegmented)
1636 * file --> data (convertToPdfDataSegmented)
1637 * pix --> data (pixConvertToPdfDataSegmented)
1638 * </pre>
1639 */
1640 l_int32
convertToPdfSegmented(const char * filein,l_int32 res,l_int32 type,l_int32 thresh,BOXA * boxa,l_int32 quality,l_float32 scalefactor,const char * title,const char * fileout)1641 convertToPdfSegmented(const char *filein,
1642 l_int32 res,
1643 l_int32 type,
1644 l_int32 thresh,
1645 BOXA *boxa,
1646 l_int32 quality,
1647 l_float32 scalefactor,
1648 const char *title,
1649 const char *fileout)
1650 {
1651 l_int32 ret;
1652 PIX *pixs;
1653
1654 PROCNAME("convertToPdfSegmented");
1655
1656 if (!filein)
1657 return ERROR_INT("filein not defined", procName, 1);
1658 if (!fileout)
1659 return ERROR_INT("fileout not defined", procName, 1);
1660 if (type != L_G4_ENCODE && type != L_JPEG_ENCODE &&
1661 type != L_FLATE_ENCODE)
1662 return ERROR_INT("invalid conversion type", procName, 1);
1663 if (boxa && scalefactor > 1.0) {
1664 L_WARNING("setting scalefactor to 1.0\n", procName);
1665 scalefactor = 1.0;
1666 }
1667
1668 if ((pixs = pixRead(filein)) == NULL)
1669 return ERROR_INT("pixs not made", procName, 1);
1670
1671 ret = pixConvertToPdfSegmented(pixs, res, type, thresh, boxa, quality,
1672 scalefactor, (title) ? title : filein,
1673 fileout);
1674 pixDestroy(&pixs);
1675 return ret;
1676 }
1677
1678
1679 /*!
1680 * \brief pixConvertToPdfSegmented()
1681 *
1682 * \param[in] pixs any depth, cmap OK
1683 * \param[in] res input image resolution; typ. 300 ppi; use 0 for default
1684 * \param[in] type compression type for non-image regions; the
1685 * image regions are always compressed with L_JPEG_ENCODE
1686 * \param[in] thresh used for converting gray --> 1 bpp with L_G4_ENCODE
1687 * \param[in] boxa [optional] of image regions; can be null
1688 * \param[in] quality used for jpeg image regions; 0 for default
1689 * \param[in] scalefactor used for jpeg regions; must be <= 1.0
1690 * \param[in] title [optional] pdf title; typically taken from the
1691 * input file for the pix
1692 * \param[in] fileout output pdf file
1693 * \return 0 if OK, 1 on error
1694 *
1695 * <pre>
1696 * Notes:
1697 * (1) See convertToPdfSegmented() for details.
1698 * </pre>
1699 */
1700 l_int32
pixConvertToPdfSegmented(PIX * pixs,l_int32 res,l_int32 type,l_int32 thresh,BOXA * boxa,l_int32 quality,l_float32 scalefactor,const char * title,const char * fileout)1701 pixConvertToPdfSegmented(PIX *pixs,
1702 l_int32 res,
1703 l_int32 type,
1704 l_int32 thresh,
1705 BOXA *boxa,
1706 l_int32 quality,
1707 l_float32 scalefactor,
1708 const char *title,
1709 const char *fileout)
1710 {
1711 l_uint8 *data;
1712 l_int32 ret;
1713 size_t nbytes;
1714
1715 PROCNAME("pixConvertToPdfSegmented");
1716
1717 if (!pixs)
1718 return ERROR_INT("pixs not defined", procName, 1);
1719 if (!fileout)
1720 return ERROR_INT("fileout not defined", procName, 1);
1721 if (type != L_G4_ENCODE && type != L_JPEG_ENCODE &&
1722 type != L_FLATE_ENCODE)
1723 return ERROR_INT("invalid conversion type", procName, 1);
1724 if (boxa && scalefactor > 1.0) {
1725 L_WARNING("setting scalefactor to 1.0\n", procName);
1726 scalefactor = 1.0;
1727 }
1728
1729 ret = pixConvertToPdfDataSegmented(pixs, res, type, thresh, boxa, quality,
1730 scalefactor, title, &data, &nbytes);
1731 if (ret)
1732 return ERROR_INT("pdf generation failure", procName, 1);
1733
1734 ret = l_binaryWrite(fileout, "w", data, nbytes);
1735 if (data) LEPT_FREE(data);
1736 return ret;
1737 }
1738
1739
1740 /*!
1741 * \brief convertToPdfDataSegmented()
1742 *
1743 * \param[in] filein input image file -- any format
1744 * \param[in] res input image resolution; typ. 300 ppi; use 0 for default
1745 * \param[in] type compression type for non-image regions; the
1746 * image regions are always compressed with L_JPEG_ENCODE
1747 * \param[in] thresh used for converting gray --> 1 bpp with L_G4_ENCODE
1748 * \param[in] boxa [optional] image regions; can be null
1749 * \param[in] quality used for jpeg image regions; 0 for default
1750 * \param[in] scalefactor used for jpeg regions; must be <= 1.0
1751 * \param[in] title [optional] pdf title; if null, uses filein
1752 * \param[out] pdata pdf data in memory
1753 * \param[out] pnbytes number of bytes in pdf data
1754 * \return 0 if OK, 1 on error
1755 *
1756 * <pre>
1757 * Notes:
1758 * (1) If there are no image regions, set %boxa == NULL;
1759 * %quality and %scalefactor are ignored.
1760 * (2) Typically, %scalefactor is < 1.0. The image regions are
1761 * </pre>
1762 */
1763 l_int32
convertToPdfDataSegmented(const char * filein,l_int32 res,l_int32 type,l_int32 thresh,BOXA * boxa,l_int32 quality,l_float32 scalefactor,const char * title,l_uint8 ** pdata,size_t * pnbytes)1764 convertToPdfDataSegmented(const char *filein,
1765 l_int32 res,
1766 l_int32 type,
1767 l_int32 thresh,
1768 BOXA *boxa,
1769 l_int32 quality,
1770 l_float32 scalefactor,
1771 const char *title,
1772 l_uint8 **pdata,
1773 size_t *pnbytes)
1774 {
1775 l_int32 ret;
1776 PIX *pixs;
1777
1778 PROCNAME("convertToPdfDataSegmented");
1779
1780 if (!pdata)
1781 return ERROR_INT("&data not defined", procName, 1);
1782 *pdata = NULL;
1783 if (!pnbytes)
1784 return ERROR_INT("&nbytes not defined", procName, 1);
1785 *pnbytes = 0;
1786 if (!filein)
1787 return ERROR_INT("filein not defined", procName, 1);
1788 if (type != L_G4_ENCODE && type != L_JPEG_ENCODE &&
1789 type != L_FLATE_ENCODE)
1790 return ERROR_INT("invalid conversion type", procName, 1);
1791 if (boxa && scalefactor > 1.0) {
1792 L_WARNING("setting scalefactor to 1.0\n", procName);
1793 scalefactor = 1.0;
1794 }
1795
1796 if ((pixs = pixRead(filein)) == NULL)
1797 return ERROR_INT("pixs not made", procName, 1);
1798
1799 ret = pixConvertToPdfDataSegmented(pixs, res, type, thresh, boxa,
1800 quality, scalefactor,
1801 (title) ? title : filein,
1802 pdata, pnbytes);
1803 pixDestroy(&pixs);
1804 return ret;
1805 }
1806
1807
1808 /*!
1809 * \brief pixConvertToPdfDataSegmented()
1810 *
1811 * \param[in] pixs any depth, cmap OK
1812 * \param[in] res input image resolution; typ. 300 ppi; use 0 for default
1813 * \param[in] type compression type for non-image regions; the
1814 * image regions are always compressed with L_JPEG_ENCODE
1815 * \param[in] thresh used for converting gray --> 1 bpp with L_G4_ENCODE
1816 * \param[in] boxa [optional] of image regions; can be null
1817 * \param[in] quality used for jpeg image regions; 0 for default
1818 * \param[in] scalefactor used for jpeg regions; must be <= 1.0
1819 * \param[in] title [optional] pdf title; typically taken from the
1820 * input file for the pix
1821 * \param[out] pdata pdf data in memory
1822 * \param[out] pnbytes number of bytes in pdf data
1823 * \return 0 if OK, 1 on error
1824 *
1825 * <pre>
1826 * Notes:
1827 * (1) See convertToPdfSegmented() for details.
1828 * </pre>
1829 */
1830 l_int32
pixConvertToPdfDataSegmented(PIX * pixs,l_int32 res,l_int32 type,l_int32 thresh,BOXA * boxa,l_int32 quality,l_float32 scalefactor,const char * title,l_uint8 ** pdata,size_t * pnbytes)1831 pixConvertToPdfDataSegmented(PIX *pixs,
1832 l_int32 res,
1833 l_int32 type,
1834 l_int32 thresh,
1835 BOXA *boxa,
1836 l_int32 quality,
1837 l_float32 scalefactor,
1838 const char *title,
1839 l_uint8 **pdata,
1840 size_t *pnbytes)
1841 {
1842 l_int32 i, nbox, seq, bx, by, bw, bh, upscale;
1843 l_float32 scale;
1844 BOX *box, *boxc, *box2;
1845 PIX *pix, *pixt1, *pixt2, *pixt3, *pixt4, *pixt5, *pixt6;
1846 PIXCMAP *cmap;
1847 L_PDF_DATA *lpd;
1848
1849 PROCNAME("pixConvertToPdfDataSegmented");
1850
1851 if (!pdata)
1852 return ERROR_INT("&data not defined", procName, 1);
1853 *pdata = NULL;
1854 if (!pnbytes)
1855 return ERROR_INT("&nbytes not defined", procName, 1);
1856 *pnbytes = 0;
1857 if (!pixs)
1858 return ERROR_INT("pixs not defined", procName, 1);
1859 if (type != L_G4_ENCODE && type != L_JPEG_ENCODE &&
1860 type != L_FLATE_ENCODE)
1861 return ERROR_INT("invalid conversion type", procName, 1);
1862 if (boxa && (scalefactor <= 0.0 || scalefactor > 1.0)) {
1863 L_WARNING("setting scalefactor to 1.0\n", procName);
1864 scalefactor = 1.0;
1865 }
1866
1867 /* Adjust scalefactor so that the product with res gives an integer */
1868 if (res <= 0)
1869 res = DEFAULT_INPUT_RES;
1870 scale = (l_float32)((l_int32)(scalefactor * res + 0.5)) / (l_float32)res;
1871 cmap = pixGetColormap(pixs);
1872
1873 /* Simple case: single image to be encoded */
1874 if (!boxa || boxaGetCount(boxa) == 0) {
1875 if (pixGetDepth(pixs) > 1 && type == L_G4_ENCODE) {
1876 if (cmap)
1877 pixt1 = pixRemoveColormap(pixs, REMOVE_CMAP_TO_GRAYSCALE);
1878 else
1879 pixt1 = pixConvertTo8(pixs, FALSE);
1880 pixt2 = pixScaleGray2xLIThresh(pixt1, thresh);
1881 pixConvertToPdfData(pixt2, type, quality, pdata, pnbytes,
1882 0, 0, 2 * res, title, NULL, 0);
1883 pixDestroy(&pixt1);
1884 pixDestroy(&pixt2);
1885 } else {
1886 pixConvertToPdfData(pixs, type, quality, pdata, pnbytes,
1887 0, 0, res, title, NULL, 0);
1888 }
1889 return 0;
1890 }
1891
1892 /* Multiple images to be encoded. If %type == L_G4_ENCODE,
1893 * jpeg encode a version of pixs that is blanked in the non-image
1894 * regions, and paint the scaled non-image part onto it through a mask.
1895 * Otherwise, we must put the non-image part down first and
1896 * then render all the image regions separately on top of it,
1897 * at their own resolution. */
1898 pixt1 = pixSetBlackOrWhiteBoxa(pixs, boxa, L_SET_WHITE); /* non-image */
1899 nbox = boxaGetCount(boxa);
1900 if (type == L_G4_ENCODE) {
1901 pixt2 = pixCreateTemplate(pixs); /* only image regions */
1902 pixSetBlackOrWhite(pixt2, L_SET_WHITE);
1903 for (i = 0; i < nbox; i++) {
1904 box = boxaGetBox(boxa, i, L_CLONE);
1905 pix = pixClipRectangle(pixs, box, &boxc);
1906 boxGetGeometry(boxc, &bx, &by, &bw, &bh);
1907 pixRasterop(pixt2, bx, by, bw, bh, PIX_SRC, pix, 0, 0);
1908 pixDestroy(&pix);
1909 boxDestroy(&box);
1910 boxDestroy(&boxc);
1911 }
1912 pixt3 = pixRemoveColormap(pixt2, REMOVE_CMAP_BASED_ON_SRC);
1913 if (pixGetDepth(pixt3) == 1)
1914 pixt4 = pixScaleToGray(pixt3, scale);
1915 else
1916 pixt4 = pixScale(pixt3, scale, scale);
1917 pixConvertToPdfData(pixt4, L_JPEG_ENCODE, quality, pdata, pnbytes,
1918 0, 0, (l_int32)(scale * res), title,
1919 &lpd, L_FIRST_IMAGE);
1920
1921 if (pixGetDepth(pixt1) == 1) {
1922 pixt5 = pixClone(pixt1);
1923 upscale = 1;
1924 } else {
1925 pixt6 = pixConvertTo8(pixt1, 0);
1926 pixt5 = pixScaleGray2xLIThresh(pixt6, thresh);
1927 pixDestroy(&pixt6);
1928 upscale = 2;
1929 }
1930 pixConvertToPdfData(pixt5, L_G4_ENCODE, quality, pdata, pnbytes,
1931 0, 0, upscale * res, title, &lpd, L_LAST_IMAGE);
1932 pixDestroy(&pixt2);
1933 pixDestroy(&pixt3);
1934 pixDestroy(&pixt4);
1935 pixDestroy(&pixt5);
1936 } else {
1937 /* Put the non-image part down first. This is the full
1938 size of the page, so we can use it to find the page
1939 height in pixels, which is required for determining
1940 the LL corner of the image relative to the LL corner
1941 of the page. */
1942 pixConvertToPdfData(pixt1, type, quality, pdata, pnbytes, 0, 0,
1943 res, title, &lpd, L_FIRST_IMAGE);
1944 for (i = 0; i < nbox; i++) {
1945 box = boxaGetBox(boxa, i, L_CLONE);
1946 pixt2 = pixClipRectangle(pixs, box, &boxc);
1947 pixt3 = pixRemoveColormap(pixt2, REMOVE_CMAP_BASED_ON_SRC);
1948 if (pixGetDepth(pixt3) == 1)
1949 pixt4 = pixScaleToGray(pixt3, scale);
1950 else
1951 pixt4 = pixScale(pixt3, scale, scale);
1952 box2 = boxTransform(boxc, 0, 0, scale, scale);
1953 boxGetGeometry(box2, &bx, &by, NULL, &bh);
1954 seq = (i == nbox - 1) ? L_LAST_IMAGE : L_NEXT_IMAGE;
1955 pixConvertToPdfData(pixt4, L_JPEG_ENCODE, quality, pdata, pnbytes,
1956 bx, by, (l_int32)(scale * res), title,
1957 &lpd, seq);
1958 pixDestroy(&pixt2);
1959 pixDestroy(&pixt3);
1960 pixDestroy(&pixt4);
1961 boxDestroy(&box);
1962 boxDestroy(&boxc);
1963 boxDestroy(&box2);
1964 }
1965 }
1966
1967 pixDestroy(&pixt1);
1968 return 0;
1969 }
1970
1971
1972 /*---------------------------------------------------------------------*
1973 * Multi-page concatenation *
1974 *---------------------------------------------------------------------*/
1975 /*!
1976 * \brief concatenatePdf()
1977 *
1978 * \param[in] dirname directory name containing single-page pdf files
1979 * \param[in] substr [optional] substring filter on filenames; can be NULL
1980 * \param[in] fileout concatenated pdf file
1981 * \return 0 if OK, 1 on error
1982 *
1983 * <pre>
1984 * Notes:
1985 * (1) This only works with leptonica-formatted single-page pdf files.
1986 * (2) If %substr is not NULL, only filenames that contain
1987 * the substring can be returned. If %substr == NULL,
1988 * none of the filenames are filtered out.
1989 * (3) The files in the directory, after optional filtering by
1990 * the substring, are lexically sorted in increasing order
1991 * before concatenation.
1992 * </pre>
1993 */
1994 l_int32
concatenatePdf(const char * dirname,const char * substr,const char * fileout)1995 concatenatePdf(const char *dirname,
1996 const char *substr,
1997 const char *fileout)
1998 {
1999 l_int32 ret;
2000 SARRAY *sa;
2001
2002 PROCNAME("concatenatePdf");
2003
2004 if (!dirname)
2005 return ERROR_INT("dirname not defined", procName, 1);
2006 if (!fileout)
2007 return ERROR_INT("fileout not defined", procName, 1);
2008
2009 if ((sa = getSortedPathnamesInDirectory(dirname, substr, 0, 0)) == NULL)
2010 return ERROR_INT("sa not made", procName, 1);
2011 ret = saConcatenatePdf(sa, fileout);
2012 sarrayDestroy(&sa);
2013 return ret;
2014 }
2015
2016
2017 /*!
2018 * \brief saConcatenatePdf()
2019 *
2020 * \param[in] sa string array of pathnames for single-page pdf files
2021 * \param[in] fileout concatenated pdf file
2022 * \return 0 if OK, 1 on error
2023 *
2024 * <pre>
2025 * Notes:
2026 * (1) This only works with leptonica-formatted single-page pdf files.
2027 * </pre>
2028 */
2029 l_int32
saConcatenatePdf(SARRAY * sa,const char * fileout)2030 saConcatenatePdf(SARRAY *sa,
2031 const char *fileout)
2032 {
2033 l_uint8 *data;
2034 l_int32 ret;
2035 size_t nbytes;
2036
2037 PROCNAME("saConcatenatePdf");
2038
2039 if (!sa)
2040 return ERROR_INT("sa not defined", procName, 1);
2041 if (!fileout)
2042 return ERROR_INT("fileout not defined", procName, 1);
2043
2044 ret = saConcatenatePdfToData(sa, &data, &nbytes);
2045 if (ret)
2046 return ERROR_INT("pdf data not made", procName, 1);
2047 ret = l_binaryWrite(fileout, "w", data, nbytes);
2048 LEPT_FREE(data);
2049 return ret;
2050 }
2051
2052
2053 /*!
2054 * \brief ptraConcatenatePdf()
2055 *
2056 * \param[in] pa array of pdf strings, each for a single-page pdf file
2057 * \param[in] fileout concatenated pdf file
2058 * \return 0 if OK, 1 on error
2059 *
2060 * <pre>
2061 * Notes:
2062 * (1) This only works with leptonica-formatted single-page pdf files.
2063 * </pre>
2064 */
2065 l_int32
ptraConcatenatePdf(L_PTRA * pa,const char * fileout)2066 ptraConcatenatePdf(L_PTRA *pa,
2067 const char *fileout)
2068 {
2069 l_uint8 *data;
2070 l_int32 ret;
2071 size_t nbytes;
2072
2073 PROCNAME("ptraConcatenatePdf");
2074
2075 if (!pa)
2076 return ERROR_INT("pa not defined", procName, 1);
2077 if (!fileout)
2078 return ERROR_INT("fileout not defined", procName, 1);
2079
2080 ret = ptraConcatenatePdfToData(pa, NULL, &data, &nbytes);
2081 if (ret)
2082 return ERROR_INT("pdf data not made", procName, 1);
2083 ret = l_binaryWrite(fileout, "w", data, nbytes);
2084 LEPT_FREE(data);
2085 return ret;
2086 }
2087
2088
2089 /*!
2090 * \brief concatenatePdfToData()
2091 *
2092 * \param[in] dirname directory name containing single-page pdf files
2093 * \param[in] substr [optional] substring filter on filenames; can be NULL
2094 * \param[out] pdata concatenated pdf data in memory
2095 * \param[out] pnbytes number of bytes in pdf data
2096 * \return 0 if OK, 1 on error
2097 *
2098 * <pre>
2099 * Notes:
2100 * (1) This only works with leptonica-formatted single-page pdf files.
2101 * (2) If %substr is not NULL, only filenames that contain
2102 * the substring can be returned. If %substr == NULL,
2103 * none of the filenames are filtered out.
2104 * (3) The files in the directory, after optional filtering by
2105 * the substring, are lexically sorted in increasing order
2106 * before concatenation.
2107 * </pre>
2108 */
2109 l_int32
concatenatePdfToData(const char * dirname,const char * substr,l_uint8 ** pdata,size_t * pnbytes)2110 concatenatePdfToData(const char *dirname,
2111 const char *substr,
2112 l_uint8 **pdata,
2113 size_t *pnbytes)
2114 {
2115 l_int32 ret;
2116 SARRAY *sa;
2117
2118 PROCNAME("concatenatePdfToData");
2119
2120 if (!pdata)
2121 return ERROR_INT("&data not defined", procName, 1);
2122 *pdata = NULL;
2123 if (!pnbytes)
2124 return ERROR_INT("&nbytes not defined", procName, 1);
2125 *pnbytes = 0;
2126 if (!dirname)
2127 return ERROR_INT("dirname not defined", procName, 1);
2128
2129 if ((sa = getSortedPathnamesInDirectory(dirname, substr, 0, 0)) == NULL)
2130 return ERROR_INT("sa not made", procName, 1);
2131 ret = saConcatenatePdfToData(sa, pdata, pnbytes);
2132 sarrayDestroy(&sa);
2133 return ret;
2134 }
2135
2136
2137 /*!
2138 * \brief saConcatenatePdfToData()
2139 *
2140 * \param[in] sa string array of pathnames for single-page pdf files
2141 * \param[out] pdata concatenated pdf data in memory
2142 * \param[out] pnbytes number of bytes in pdf data
2143 * \return 0 if OK, 1 on error
2144 *
2145 * <pre>
2146 * Notes:
2147 * (1) This only works with leptonica-formatted single-page pdf files.
2148 * </pre>
2149 */
2150 l_int32
saConcatenatePdfToData(SARRAY * sa,l_uint8 ** pdata,size_t * pnbytes)2151 saConcatenatePdfToData(SARRAY *sa,
2152 l_uint8 **pdata,
2153 size_t *pnbytes)
2154 {
2155 char *fname;
2156 l_int32 i, npages, ret;
2157 L_BYTEA *bas;
2158 L_PTRA *pa_data; /* input pdf data for each page */
2159
2160 PROCNAME("saConcatenatePdfToData");
2161
2162 if (!pdata)
2163 return ERROR_INT("&data not defined", procName, 1);
2164 *pdata = NULL;
2165 if (!pnbytes)
2166 return ERROR_INT("&nbytes not defined", procName, 1);
2167 *pnbytes = 0;
2168 if (!sa)
2169 return ERROR_INT("sa not defined", procName, 1);
2170
2171 /* Read the pdf files into memory */
2172 if ((npages = sarrayGetCount(sa)) == 0)
2173 return ERROR_INT("no filenames found", procName, 1);
2174 pa_data = ptraCreate(npages);
2175 for (i = 0; i < npages; i++) {
2176 fname = sarrayGetString(sa, i, L_NOCOPY);
2177 bas = l_byteaInitFromFile(fname);
2178 ptraAdd(pa_data, bas);
2179 }
2180
2181 ret = ptraConcatenatePdfToData(pa_data, sa, pdata, pnbytes);
2182
2183 /* Cleanup: some pages could have been removed */
2184 ptraGetActualCount(pa_data, &npages);
2185 for (i = 0; i < npages; i++) {
2186 bas = (L_BYTEA *)ptraRemove(pa_data, i, L_NO_COMPACTION);
2187 l_byteaDestroy(&bas);
2188 }
2189 ptraDestroy(&pa_data, FALSE, FALSE);
2190 return ret;
2191 }
2192
2193 /* --------------------------------------------*/
2194 #endif /* USE_PDFIO */
2195 /* --------------------------------------------*/
2196