1 /*====================================================================*
2  -  Copyright (C) 2001 Leptonica.  All rights reserved.
3  -
4  -  Redistribution and use in source and binary forms, with or without
5  -  modification, are permitted provided that the following conditions
6  -  are met:
7  -  1. Redistributions of source code must retain the above copyright
8  -     notice, this list of conditions and the following disclaimer.
9  -  2. Redistributions in binary form must reproduce the above
10  -     copyright notice, this list of conditions and the following
11  -     disclaimer in the documentation and/or other materials
12  -     provided with the distribution.
13  -
14  -  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
15  -  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
16  -  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
17  -  A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL ANY
18  -  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19  -  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20  -  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21  -  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22  -  OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
23  -  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24  -  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  *====================================================================*/
26 
27 /*!
28  * \file pdfio1.c
29  * <pre>
30  *
31  *    Higher-level operations for generating pdf.
32  *
33  *    |=============================================================|
34  *    |                         Important note                      |
35  *    |=============================================================|
36  *    | Some of these functions require libtiff, libjpeg, and libz  |
37  *    | If you do not have these libraries, you must set            |
38  *    |      #define  USE_PDFIO     0                               |
39  *    | in environ.h.  This will link pdfiostub.c                   |
40  *    |=============================================================|
41  *
42  *     Set 1. These functions convert a set of image files
43  *     to a multi-page pdf file, with one image on each page.
44  *     All images are rendered at the same (input) resolution.
45  *     The images can be specified as being in a directory, or they
46  *     can be in an sarray.  The output pdf can be either a file
47  *     or an array of bytes in memory.
48  *
49  *     Set 2. These functions are a special case of set 1, where
 *     no scaling or change in quality is required.  For jpeg and
51  *     jp2k images, the bytes in each jpeg file can be directly
52  *     incorporated into the output pdf, and the wrapping up of
53  *     multiple image files is very fast.  For non-interlaced png,
54  *     the data bytes including the predictors can also be written
55  *     directly into the flate pdf data.  For other image formats,
56  *     transcoding is required, where the image data is first
57  *     decompressed and then the G4 or Flate (gzip) encodings are generated.
58  *
59  *     Set 3. These functions convert a set of images in memory
60  *     to a multi-page pdf, with one image on each page.  The pdf
61  *     output can be either a file or an array of bytes in memory.
62  *
63  *     Set 4. These functions implement a pdf output "device driver"
64  *     for wrapping (encoding) any number of images on a single page
65  *     in pdf.  The input can be either an image file or a Pix;
66  *     the pdf output can be either a file or an array of bytes in memory.
67  *
68  *     Set 5. These "segmented" functions take a set of image
69  *     files, along with optional segmentation information, and
70  *     generate a multi-page pdf file, where each page consists
71  *     in general of a mixed raster pdf of image and non-image regions.
72  *     The segmentation information for each page can be input as
73  *     either a mask over the image parts, or as a Boxa of those
74  *     regions.
75  *
76  *     Set 6. These "segmented" functions convert an image and
77  *     an optional Boxa of image regions into a mixed raster pdf file
78  *     for the page.  The input image can be either a file or a Pix.
79  *
 *     Set 7. These functions take a set of single-page pdf files
 *     and concatenate them into a multi-page pdf.
82  *     The input can be a set of single page pdf files, or of
83  *     pdf 'strings' in memory.  The output can be either a file or
84  *     an array of bytes in memory.
85  *
86  *     The images in the pdf file can be rendered using a pdf viewer,
87  *     such as gv, evince, xpdf or acroread.
88  *
89  *     Reference on the pdf file format:
90  *         http://www.adobe.com/devnet/pdf/pdf_reference_archive.html
91  *
92  *     1. Convert specified image files to pdf (one image file per page)
93  *          l_int32             convertFilesToPdf()
94  *          l_int32             saConvertFilesToPdf()
95  *          l_int32             saConvertFilesToPdfData()
96  *          l_int32             selectDefaultPdfEncoding()
97  *
98  *     2. Convert specified image files to pdf without scaling
99  *          l_int32             convertUnscaledFilesToPdf()
100  *          l_int32             saConvertUnscaledFilesToPdf()
101  *          l_int32             saConvertUnscaledFilesToPdfData()
102  *          l_int32             convertUnscaledToPdfData()
103  *
104  *     3. Convert multiple images to pdf (one image per page)
105  *          l_int32             pixaConvertToPdf()
106  *          l_int32             pixaConvertToPdfData()
107  *
108  *     4. Single page, multi-image converters
109  *          l_int32             convertToPdf()
110  *          l_int32             convertImageDataToPdf()
111  *          l_int32             convertToPdfData()
112  *          l_int32             convertImageDataToPdfData()
113  *          l_int32             pixConvertToPdf()
114  *          l_int32             pixWriteStreamPdf()
115  *          l_int32             pixWriteMemPdf()
116  *
117  *     5. Segmented multi-page, multi-image converter
118  *          l_int32             convertSegmentedFilesToPdf()
119  *          BOXAA              *convertNumberedMasksToBoxaa()
120  *
121  *     6. Segmented single page, multi-image converters
122  *          l_int32             convertToPdfSegmented()
123  *          l_int32             pixConvertToPdfSegmented()
124  *          l_int32             convertToPdfDataSegmented()
125  *          l_int32             pixConvertToPdfDataSegmented()
126  *
127  *     7. Multipage concatenation
128  *          l_int32             concatenatePdf()
129  *          l_int32             saConcatenatePdf()
130  *          l_int32             ptraConcatenatePdf()
131  *          l_int32             concatenatePdfToData()
132  *          l_int32             saConcatenatePdfToData()
133  *
134  *     The top-level multi-image functions can be visualized as follows:
135  *          Output pdf data to file:
136  *             convertToPdf()  and  convertImageDataToPdf()
137  *                     --> pixConvertToPdf()
138  *                           --> pixConvertToPdfData()
139  *
140  *          Output pdf data to array in memory:
141  *             convertToPdfData()  and  convertImageDataToPdfData()
142  *                     --> pixConvertToPdfData()
143  *
144  *     The top-level segmented image functions can be visualized as follows:
145  *          Output pdf data to file:
146  *             convertToPdfSegmented()
147  *                     --> pixConvertToPdfSegmented()
148  *                           --> pixConvertToPdfDataSegmented()
149  *
150  *          Output pdf data to array in memory:
151  *             convertToPdfDataSegmented()
152  *                     --> pixConvertToPdfDataSegmented()
153  *
154  *     For multi-page concatenation, there are three different types of input
155  *        (1) directory and optional filename filter
156  *        (2) sarray of filenames
157  *        (3) ptra of byte arrays of pdf data
158  *     and two types of output for the concatenated pdf data
159  *        (1) filename
160  *        (2) data array and size
161  *     High-level interfaces are given for each of the six combinations.
162  *
163  *     Note: When wrapping small images into pdf, it is useful to give
164  *     them a relatively low resolution value, to avoid rounding errors
165  *     when rendering the images.  For example, if you want an image
166  *     of width w pixels to be 5 inches wide on a screen, choose a
167  *     resolution w/5.
168  *
169  *     The very fast functions in section (2) require neither transcoding
170  *     nor parsing of the compressed jpeg file.  With three types of image
171  *     compression, the compressed strings can be incorporated into
172  *     the pdf data without decompression and re-encoding: jpeg, jp2k
173  *     and png.  The DCTDecode and JPXDecode filters can handle the
174  *     entire jpeg and jp2k encoded string as a byte array in the pdf file.
175  *     The FlateDecode filter can handle the png compressed image data,
176  *     including predictors that occur as the first byte in each
177  *     raster line, but it is necessary to store only the png IDAT chunk
178  *     data in the pdf array.  The alternative for wrapping png images
179  *     is to uncompress into a raster (a pix) and then gzip the raster data.
180  *     This typically results in a larger pdf file, because it doesn't
181  *     use the two-dimensional png predictor.  Colormaps, which are found
182  *     in png PLTE chunks, must always be pulled out and included separately
183  *     in the pdf.  For CCITT-G4 compression, you can not simply
184  *     include a tiff G4 file -- you must either parse it and extract the
185  *     G4 compressed data within it, or uncompress to a raster and
186  *     G4 compress again.
187  * </pre>
188  */
189 
190 #include <string.h>
191 #include <math.h>
192 #include "allheaders.h"
193 
194 /* --------------------------------------------*/
195 #if  USE_PDFIO   /* defined in environ.h */
196  /* --------------------------------------------*/
197 
198     /* Typical scan resolution in ppi (pixels/inch) */
199 static const l_int32  DEFAULT_INPUT_RES = 300;
200 
201 
202 /*---------------------------------------------------------------------*
203  *    Convert specified image files to pdf (one image file per page)   *
204  *---------------------------------------------------------------------*/
205 /*!
206  * \brief   convertFilesToPdf()
207  *
208  * \param[in]    dirname directory name containing images
209  * \param[in]    substr [optional] substring filter on filenames; can be NULL
210  * \param[in]    res input resolution of all images
211  * \param[in]    scalefactor scaling factor applied to each image; > 0.0
212  * \param[in]    type encoding type (L_JPEG_ENCODE, L_G4_ENCODE,
213  *                    L_FLATE_ENCODE, or 0 for default
214  * \param[in]    quality used for JPEG only; 0 for default (75)
215  * \param[in]    title [optional] pdf title; if null, taken from the first
216  *                     image filename
217  * \param[in]    fileout pdf file of all images
218  * \return  0 if OK, 1 on error
219  *
220  * <pre>
221  * Notes:
222  *      (1) If %substr is not NULL, only image filenames that contain
223  *          the substring can be used.  If %substr == NULL, all files
224  *          in the directory are used.
225  *      (2) The files in the directory, after optional filtering by
226  *          the substring, are lexically sorted in increasing order
227  *          before concatenation.
228  *      (3) The scalefactor is applied to each image before encoding.
229  *          If you enter a value <= 0.0, it will be set to 1.0.
230  *      (4) Specifying one of the three encoding types for %type forces
231  *          all images to be compressed with that type.  Use 0 to have
232  *          the type determined for each image based on depth and whether
233  *          or not it has a colormap.
234  * </pre>
235  */
236 l_int32
convertFilesToPdf(const char * dirname,const char * substr,l_int32 res,l_float32 scalefactor,l_int32 type,l_int32 quality,const char * title,const char * fileout)237 convertFilesToPdf(const char  *dirname,
238                   const char  *substr,
239                   l_int32      res,
240                   l_float32    scalefactor,
241                   l_int32      type,
242                   l_int32      quality,
243                   const char  *title,
244                   const char  *fileout)
245 {
246 l_int32  ret;
247 SARRAY  *sa;
248 
249     PROCNAME("convertFilesToPdf");
250 
251     if (!dirname)
252         return ERROR_INT("dirname not defined", procName, 1);
253     if (!fileout)
254         return ERROR_INT("fileout not defined", procName, 1);
255 
256     if ((sa = getSortedPathnamesInDirectory(dirname, substr, 0, 0)) == NULL)
257         return ERROR_INT("sa not made", procName, 1);
258     ret = saConvertFilesToPdf(sa, res, scalefactor, type, quality,
259                               title, fileout);
260     sarrayDestroy(&sa);
261     return ret;
262 }
263 
264 
265 /*!
266  * \brief   saConvertFilesToPdf()
267  *
268  * \param[in]    sa string array of pathnames for images
269  * \param[in]    res input resolution of all images
270  * \param[in]    scalefactor scaling factor applied to each image; > 0.0
271  * \param[in]    type encoding type (L_JPEG_ENCODE, L_G4_ENCODE,
272  *                    L_FLATE_ENCODE, or 0 for default
273  * \param[in]    quality used for JPEG only; 0 for default (75)
274  * \param[in]    title [optional] pdf title; if null, taken from the first
275  *                     image filename
276  * \param[in]    fileout pdf file of all images
277  * \return  0 if OK, 1 on error
278  *
279  * <pre>
280  * Notes:
281  *      (1) See convertFilesToPdf().
282  * </pre>
283  */
284 l_int32
saConvertFilesToPdf(SARRAY * sa,l_int32 res,l_float32 scalefactor,l_int32 type,l_int32 quality,const char * title,const char * fileout)285 saConvertFilesToPdf(SARRAY      *sa,
286                     l_int32      res,
287                     l_float32    scalefactor,
288                     l_int32      type,
289                     l_int32      quality,
290                     const char  *title,
291                     const char  *fileout)
292 {
293 l_uint8  *data;
294 l_int32   ret;
295 size_t    nbytes;
296 
297     PROCNAME("saConvertFilesToPdf");
298 
299     if (!sa)
300         return ERROR_INT("sa not defined", procName, 1);
301 
302     ret = saConvertFilesToPdfData(sa, res, scalefactor, type, quality,
303                                   title, &data, &nbytes);
304     if (ret) {
305         if (data) LEPT_FREE(data);
306         return ERROR_INT("pdf data not made", procName, 1);
307     }
308 
309     ret = l_binaryWrite(fileout, "w", data, nbytes);
310     LEPT_FREE(data);
311     if (ret)
312         L_ERROR("pdf data not written to file\n", procName);
313     return ret;
314 }
315 
316 
317 /*!
318  * \brief   saConvertFilesToPdfData()
319  *
320  * \param[in]    sa string array of pathnames for images
321  * \param[in]    res input resolution of all images
322  * \param[in]    scalefactor scaling factor applied to each image; > 0.0
323  * \param[in]    type encoding type (L_JPEG_ENCODE, L_G4_ENCODE,
324  *                    L_FLATE_ENCODE, or 0 for default
325  * \param[in]    quality used for JPEG only; 0 for default (75)
326  * \param[in]    title [optional] pdf title; if null, taken from the first
327  *                     image filename
328  * \param[out]   pdata output pdf data (of all images
329  * \param[out]   pnbytes size of output pdf data
330  * \return  0 if OK, 1 on error
331  *
332  * <pre>
333  * Notes:
334  *      (1) See convertFilesToPdf().
335  * </pre>
336  */
337 l_int32
saConvertFilesToPdfData(SARRAY * sa,l_int32 res,l_float32 scalefactor,l_int32 type,l_int32 quality,const char * title,l_uint8 ** pdata,size_t * pnbytes)338 saConvertFilesToPdfData(SARRAY      *sa,
339                         l_int32      res,
340                         l_float32    scalefactor,
341                         l_int32      type,
342                         l_int32      quality,
343                         const char  *title,
344                         l_uint8    **pdata,
345                         size_t      *pnbytes)
346 {
347 char        *fname;
348 const char  *pdftitle;
349 l_uint8     *imdata;
350 l_int32      i, n, ret, pagetype, npages, scaledres;
351 size_t       imbytes;
352 L_BYTEA     *ba;
353 PIX         *pixs, *pix;
354 L_PTRA      *pa_data;
355 
356     PROCNAME("saConvertFilesToPdfData");
357 
358     if (!pdata)
359         return ERROR_INT("&data not defined", procName, 1);
360     *pdata = NULL;
361     if (!pnbytes)
362         return ERROR_INT("&nbytes not defined", procName, 1);
363     *pnbytes = 0;
364     if (!sa)
365         return ERROR_INT("sa not defined", procName, 1);
366     if (scalefactor <= 0.0) scalefactor = 1.0;
367     if (type < 0 || type > L_FLATE_ENCODE) {
368         L_WARNING("invalid compression type; using per-page default\n",
369                   procName);
370         type = 0;
371     }
372 
373         /* Generate all the encoded pdf strings */
374     n = sarrayGetCount(sa);
375     pa_data = ptraCreate(n);
376     pdftitle = NULL;
377     for (i = 0; i < n; i++) {
378         if (i && (i % 10 == 0)) fprintf(stderr, ".. %d ", i);
379         fname = sarrayGetString(sa, i, L_NOCOPY);
380         if ((pixs = pixRead(fname)) == NULL) {
381             L_ERROR("image not readable from file %s\n", procName, fname);
382             continue;
383         }
384         if (!pdftitle)
385             pdftitle = (title) ? title : fname;
386         if (scalefactor != 1.0)
387             pix = pixScale(pixs, scalefactor, scalefactor);
388         else
389             pix = pixClone(pixs);
390         pixDestroy(&pixs);
391         scaledres = (l_int32)(res * scalefactor);
392         if (type != 0) {
393             pagetype = type;
394         } else if (selectDefaultPdfEncoding(pix, &pagetype) != 0) {
395             pixDestroy(&pix);
396             L_ERROR("encoding type selection failed for file %s\n",
397                     procName, fname);
398             continue;
399         }
400         ret = pixConvertToPdfData(pix, pagetype, quality, &imdata, &imbytes,
401                                   0, 0, scaledres, pdftitle, NULL, 0);
402         pixDestroy(&pix);
403         if (ret) {
404             LEPT_FREE(imdata);
405             L_ERROR("pdf encoding failed for %s\n", procName, fname);
406             continue;
407         }
408         ba = l_byteaInitFromMem(imdata, imbytes);
409         LEPT_FREE(imdata);
410         ptraAdd(pa_data, ba);
411     }
412     ptraGetActualCount(pa_data, &npages);
413     if (npages == 0) {
414         L_ERROR("no pdf files made\n", procName);
415         ptraDestroy(&pa_data, FALSE, FALSE);
416         return 1;
417     }
418 
419         /* Concatenate them */
420     fprintf(stderr, "\nconcatenating ... ");
421     ret = ptraConcatenatePdfToData(pa_data, NULL, pdata, pnbytes);
422     fprintf(stderr, "done\n");
423 
424     ptraGetActualCount(pa_data, &npages);  /* recalculate in case it changes */
425     for (i = 0; i < npages; i++) {
426         ba = (L_BYTEA *)ptraRemove(pa_data, i, L_NO_COMPACTION);
427         l_byteaDestroy(&ba);
428     }
429     ptraDestroy(&pa_data, FALSE, FALSE);
430     return ret;
431 }
432 
433 
434 /*!
435  * \brief   selectDefaultPdfEncoding()
436  *
437  * \param[in]    pix
438  * \param[out]   ptype L_G4_ENCODE, L_JPEG_ENCODE, L_FLATE_ENCODE
439  *
440  * <pre>
441  * Notes:
442  *      (1) This attempts to choose an encoding for the pix that results
443  *          in the smallest file, assuming that if jpeg encoded, it will
444  *          use quality = 75.  The decision is approximate, in that
445  *          (a) all colormapped images will be losslessly encoded with
446  *          gzip (flate), and (b) an image with less than about 20 colors
447  *          is likely to be smaller if flate encoded than if encoded
448  *          as a jpeg (dct).  For example, an image made by pixScaleToGray3()
449  *          will have 10 colors, and flate encoding will give about
450  *          twice the compression as jpeg with quality = 75.
451  * </pre>
452  */
453 l_int32
selectDefaultPdfEncoding(PIX * pix,l_int32 * ptype)454 selectDefaultPdfEncoding(PIX      *pix,
455                          l_int32  *ptype)
456 {
457 l_int32   w, h, d, factor, ncolors;
458 PIXCMAP  *cmap;
459 
460     PROCNAME("selectDefaultPdfEncoding");
461 
462     if (!pix)
463         return ERROR_INT("pix not defined", procName, 1);
464     if (!ptype)
465         return ERROR_INT("&type not defined", procName, 1);
466     *ptype = L_FLATE_ENCODE;  /* default universal encoding */
467     pixGetDimensions(pix, &w, &h, &d);
468     cmap = pixGetColormap(pix);
469     if (d == 8 && !cmap) {
470         factor = L_MAX(1, (l_int32)sqrt((l_float64)(w * h) / 20000.));
471         pixNumColors(pix, factor, &ncolors);
472         if (ncolors < 20)
473             *ptype = L_FLATE_ENCODE;
474         else
475             *ptype = L_JPEG_ENCODE;
476     } else if (d == 1) {
477         *ptype = L_G4_ENCODE;
478     } else if (cmap || d == 2 || d == 4) {
479         *ptype = L_FLATE_ENCODE;
480     } else if (d == 8 || d == 32) {
481         *ptype = L_JPEG_ENCODE;
482     } else {
483         return ERROR_INT("type selection failure", procName, 1);
484     }
485 
486     return 0;
487 }
488 
489 
490 /*---------------------------------------------------------------------*
491  *          Convert specified image files to pdf without scaling       *
492  *---------------------------------------------------------------------*/
493 /*!
494  * \brief   convertUnscaledFilesToPdf()
495  *
496  * \param[in]    dirname directory name containing images
497  * \param[in]    substr [optional] substring filter on filenames; can be NULL
498  * \param[in]    title [optional] pdf title; if null, taken from the first
499  *                     image filename
500  * \param[in]    fileout pdf file of all images
501  * \return  0 if OK, 1 on error
502  *
503  * <pre>
504  * Notes:
505  *      (1) If %substr is not NULL, only image filenames that contain
506  *          the substring can be used.  If %substr == NULL, all files
507  *          in the directory are used.
508  *      (2) The files in the directory, after optional filtering by
509  *          the substring, are lexically sorted in increasing order
510  *          before concatenation.
511  *      (3) For jpeg and jp2k, this is very fast because the compressed
512  *          data is wrapped up and concatenated.  For png and tiffg4,
513  *          the images must be read and recompressed.
514  * </pre>
515  */
516 l_int32
convertUnscaledFilesToPdf(const char * dirname,const char * substr,const char * title,const char * fileout)517 convertUnscaledFilesToPdf(const char  *dirname,
518                           const char  *substr,
519                           const char  *title,
520                           const char  *fileout)
521 {
522 l_int32  ret;
523 SARRAY  *sa;
524 
525     PROCNAME("convertUnscaledFilesToPdf");
526 
527     if (!dirname)
528         return ERROR_INT("dirname not defined", procName, 1);
529     if (!fileout)
530         return ERROR_INT("fileout not defined", procName, 1);
531 
532     if ((sa = getSortedPathnamesInDirectory(dirname, substr, 0, 0)) == NULL)
533         return ERROR_INT("sa not made", procName, 1);
534     ret = saConvertUnscaledFilesToPdf(sa, title, fileout);
535     sarrayDestroy(&sa);
536     return ret;
537 }
538 
539 
540 /*!
541  * \brief   saConvertUnscaledFilesToPdf()
542  *
543  * \param[in]    sa string array of pathnames for images
544  * \param[in]    title [optional] pdf title; if null, taken from the first
545  *                     image filename
546  * \param[in]    fileout pdf file of all images
547  * \return  0 if OK, 1 on error
548  *
549  * <pre>
550  * Notes:
551  *      (1) See convertUnscaledFilesToPdf().
552  * </pre>
553  */
554 l_int32
saConvertUnscaledFilesToPdf(SARRAY * sa,const char * title,const char * fileout)555 saConvertUnscaledFilesToPdf(SARRAY      *sa,
556                             const char  *title,
557                             const char  *fileout)
558 {
559 l_uint8  *data;
560 l_int32   ret;
561 size_t    nbytes;
562 
563     PROCNAME("saConvertUnscaledFilesToPdf");
564 
565     if (!sa)
566         return ERROR_INT("sa not defined", procName, 1);
567 
568     ret = saConvertUnscaledFilesToPdfData(sa, title, &data, &nbytes);
569     if (ret) {
570         if (data) LEPT_FREE(data);
571         return ERROR_INT("pdf data not made", procName, 1);
572     }
573 
574     ret = l_binaryWrite(fileout, "w", data, nbytes);
575     LEPT_FREE(data);
576     if (ret)
577         L_ERROR("pdf data not written to file\n", procName);
578     return ret;
579 }
580 
581 
582 /*!
583  * \brief   saConvertUnscaledFilesToPdfData()
584  *
585  * \param[in]    sa string array of pathnames for images
586  * \param[in]    title [optional] pdf title; if null, taken from the first
587  *                     image filename
588  * \param[out]   pdata output pdf data (of all images)
589  * \param[out]   pnbytes size of output pdf data
590  * \return  0 if OK, 1 on error
591  */
592 l_int32
saConvertUnscaledFilesToPdfData(SARRAY * sa,const char * title,l_uint8 ** pdata,size_t * pnbytes)593 saConvertUnscaledFilesToPdfData(SARRAY      *sa,
594                                 const char  *title,
595                                 l_uint8    **pdata,
596                                 size_t      *pnbytes)
597 {
598 char         *fname;
599 l_uint8      *imdata;
600 l_int32       i, n, ret, npages;
601 size_t        imbytes;
602 L_BYTEA      *ba;
603 L_PTRA       *pa_data;
604 
605     PROCNAME("saConvertUnscaledFilesToPdfData");
606 
607     if (!pdata)
608         return ERROR_INT("&data not defined", procName, 1);
609     *pdata = NULL;
610     if (!pnbytes)
611         return ERROR_INT("&nbytes not defined", procName, 1);
612     *pnbytes = 0;
613     if (!sa)
614         return ERROR_INT("sa not defined", procName, 1);
615 
616         /* Generate all the encoded pdf strings */
617     n = sarrayGetCount(sa);
618     pa_data = ptraCreate(n);
619     for (i = 0; i < n; i++) {
620         if (i && (i % 10 == 0)) fprintf(stderr, ".. %d ", i);
621         fname = sarrayGetString(sa, i, L_NOCOPY);
622 
623             /* Generate the pdf data */
624         if (convertUnscaledToPdfData(fname, title, &imdata, &imbytes))
625             continue;
626 
627             /* ... and add it to the array of single page data */
628         ba = l_byteaInitFromMem(imdata, imbytes);
629         if (imdata) LEPT_FREE(imdata);
630         ptraAdd(pa_data, ba);
631     }
632     ptraGetActualCount(pa_data, &npages);
633     if (npages == 0) {
634         L_ERROR("no pdf files made\n", procName);
635         ptraDestroy(&pa_data, FALSE, FALSE);
636         return 1;
637     }
638 
639         /* Concatenate to generate a multipage pdf */
640     fprintf(stderr, "\nconcatenating ... ");
641     ret = ptraConcatenatePdfToData(pa_data, NULL, pdata, pnbytes);
642     fprintf(stderr, "done\n");
643 
644         /* Clean up */
645     ptraGetActualCount(pa_data, &npages);  /* maybe failed to read some files */
646     for (i = 0; i < npages; i++) {
647         ba = (L_BYTEA *)ptraRemove(pa_data, i, L_NO_COMPACTION);
648         l_byteaDestroy(&ba);
649     }
650     ptraDestroy(&pa_data, FALSE, FALSE);
651     return ret;
652 }
653 
654 
655 /*!
656  * \brief   convertUnscaledToPdfData()
657  *
658  * \param[in]    fname of image file
659  * \param[in]    title [optional] pdf title; can be NULL
660  * \param[out]   pdata output pdf data for image
661  * \param[out]   pnbytes size of output pdf data
662  * \return  0 if OK, 1 on error
663  */
664 l_int32
convertUnscaledToPdfData(const char * fname,const char * title,l_uint8 ** pdata,size_t * pnbytes)665 convertUnscaledToPdfData(const char  *fname,
666                          const char  *title,
667                          l_uint8    **pdata,
668                          size_t      *pnbytes)
669 {
670 const char   *pdftitle = NULL;
671 char         *tail = NULL;
672 l_int32       format;
673 L_COMP_DATA  *cid;
674 
675     PROCNAME("convertUnscaledToPdfData");
676 
677     if (!pdata)
678         return ERROR_INT("&data not defined", procName, 1);
679     *pdata = NULL;
680     if (!pnbytes)
681         return ERROR_INT("&nbytes not defined", procName, 1);
682     *pnbytes = 0;
683     if (!fname)
684         return ERROR_INT("fname not defined", procName, 1);
685 
686     findFileFormat(fname, &format);
687     if (format == IFF_UNKNOWN) {
688         L_WARNING("file %s format is unknown; skip\n", procName, fname);
689         return 1;
690     }
691     if (format == IFF_PS || format == IFF_LPDF) {
692         L_WARNING("file %s format is %d; skip\n", procName, fname, format);
693         return 1;
694     }
695 
696         /* Generate the image data required for pdf generation, always
697          * in binary (not ascii85) coding; jpeg files are never transcoded.  */
698     l_generateCIDataForPdf(fname, NULL, 0, &cid);
699     if (!cid) {
700         L_ERROR("file %s format is %d; unreadable\n", procName, fname, format);
701         return 1;
702     }
703 
704         /* If %title == NULL, use the tail of %fname. */
705     if (title) {
706         pdftitle = title;
707     } else {
708         splitPathAtDirectory(fname, NULL, &tail);
709         pdftitle = tail;
710     }
711 
712         /* Generate the pdf string for this page (image).  This destroys
713          * the cid by attaching it to an lpd and destroying the lpd. */
714     cidConvertToPdfData(cid, pdftitle, pdata, pnbytes);
715     LEPT_FREE(tail);
716     return 0;
717 }
718 
719 
720 /*---------------------------------------------------------------------*
721  *          Convert multiple images to pdf (one image per page)        *
722  *---------------------------------------------------------------------*/
723 /*!
724  * \brief   pixaConvertToPdf()
725  *
726  * \param[in]    pixa containing images all at the same resolution
727  * \param[in]    res override the resolution of each input image, in ppi;
728  *                   use 0 to respect the resolution embedded in the input
729  * \param[in]    scalefactor scaling factor applied to each image; > 0.0
730  * \param[in]    type encoding type (L_JPEG_ENCODE, L_G4_ENCODE,
731  *                    L_FLATE_ENCODE, or 0 for default
732  * \param[in]    quality used for JPEG only; 0 for default (75)
733  * \param[in]    title [optional] pdf title
734  * \param[in]    fileout pdf file of all images
735  * \return  0 if OK, 1 on error
736  *
737  * <pre>
738  * Notes:
739  *      (1) The images are encoded with G4 if 1 bpp; JPEG if 8 bpp without
740  *          colormap and many colors, or 32 bpp; FLATE for anything else.
741  *      (2) The scalefactor must be > 0.0; otherwise it is set to 1.0.
742  *      (3) Specifying one of the three encoding types for %type forces
743  *          all images to be compressed with that type.  Use 0 to have
744  *          the type determined for each image based on depth and whether
745  *          or not it has a colormap.
746  * </pre>
747  */
748 l_int32
pixaConvertToPdf(PIXA * pixa,l_int32 res,l_float32 scalefactor,l_int32 type,l_int32 quality,const char * title,const char * fileout)749 pixaConvertToPdf(PIXA        *pixa,
750                  l_int32      res,
751                  l_float32    scalefactor,
752                  l_int32      type,
753                  l_int32      quality,
754                  const char  *title,
755                  const char  *fileout)
756 {
757 l_uint8  *data;
758 l_int32   ret;
759 size_t    nbytes;
760 
761     PROCNAME("pixaConvertToPdf");
762 
763     if (!pixa)
764         return ERROR_INT("pixa not defined", procName, 1);
765 
766     ret = pixaConvertToPdfData(pixa, res, scalefactor, type, quality,
767                                title, &data, &nbytes);
768     if (ret) {
769         LEPT_FREE(data);
770         return ERROR_INT("conversion to pdf failed", procName, 1);
771     }
772 
773     ret = l_binaryWrite(fileout, "w", data, nbytes);
774     LEPT_FREE(data);
775     if (ret)
776         L_ERROR("pdf data not written to file\n", procName);
777     return ret;
778 }
779 
780 
781 /*!
782  * \brief   pixaConvertToPdfData()
783  *
784  * \param[in]    pixa containing images all at the same resolution
785  * \param[in]    res input resolution of all images
786  * \param[in]    scalefactor scaling factor applied to each image; > 0.0
787  * \param[in]    type encoding type (L_JPEG_ENCODE, L_G4_ENCODE,
788  *                    L_FLATE_ENCODE, or 0 for default
789  * \param[in]    quality used for JPEG only; 0 for default (75)
790  * \param[in]    title [optional] pdf title
791  * \param[out]   pdata output pdf data (of all images
792  * \param[out]   pnbytes size of output pdf data
793  * \return  0 if OK, 1 on error
794  *
795  * <pre>
796  * Notes:
797  *      (1) See pixaConvertToPdf().
798  * </pre>
799  */
800 l_int32
pixaConvertToPdfData(PIXA * pixa,l_int32 res,l_float32 scalefactor,l_int32 type,l_int32 quality,const char * title,l_uint8 ** pdata,size_t * pnbytes)801 pixaConvertToPdfData(PIXA        *pixa,
802                      l_int32      res,
803                      l_float32    scalefactor,
804                      l_int32      type,
805                      l_int32      quality,
806                      const char  *title,
807                      l_uint8    **pdata,
808                      size_t      *pnbytes)
809 {
810 l_uint8  *imdata;
811 l_int32   i, n, ret, scaledres, pagetype;
812 size_t    imbytes;
813 L_BYTEA  *ba;
814 PIX      *pixs, *pix;
815 L_PTRA   *pa_data;
816 
817     PROCNAME("pixaConvertToPdfData");
818 
819     if (!pdata)
820         return ERROR_INT("&data not defined", procName, 1);
821     *pdata = NULL;
822     if (!pnbytes)
823         return ERROR_INT("&nbytes not defined", procName, 1);
824     *pnbytes = 0;
825     if (!pixa)
826         return ERROR_INT("pixa not defined", procName, 1);
827     if (scalefactor <= 0.0) scalefactor = 1.0;
828     if (type < 0 || type > L_FLATE_ENCODE) {
829         L_WARNING("invalid compression type; using per-page default\n",
830                   procName);
831         type = 0;
832     }
833 
834         /* Generate all the encoded pdf strings */
835     n = pixaGetCount(pixa);
836     pa_data = ptraCreate(n);
837     for (i = 0; i < n; i++) {
838         if ((pixs = pixaGetPix(pixa, i, L_CLONE)) == NULL) {
839             L_ERROR("pix[%d] not retrieved\n", procName, i);
840             continue;
841         }
842         if (scalefactor != 1.0)
843             pix = pixScale(pixs, scalefactor, scalefactor);
844         else
845             pix = pixClone(pixs);
846         pixDestroy(&pixs);
847         scaledres = (l_int32)(res * scalefactor);
848         if (type != 0) {
849             pagetype = type;
850         } else if (selectDefaultPdfEncoding(pix, &pagetype) != 0) {
851             L_ERROR("encoding type selection failed for pix[%d]\n",
852                         procName, i);
853             pixDestroy(&pix);
854             continue;
855         }
856         ret = pixConvertToPdfData(pix, pagetype, quality, &imdata, &imbytes,
857                                   0, 0, scaledres, title, NULL, 0);
858         pixDestroy(&pix);
859         if (ret) {
860             LEPT_FREE(imdata);
861             L_ERROR("pdf encoding failed for pix[%d]\n", procName, i);
862             continue;
863         }
864         ba = l_byteaInitFromMem(imdata, imbytes);
865         LEPT_FREE(imdata);
866         ptraAdd(pa_data, ba);
867     }
868     ptraGetActualCount(pa_data, &n);
869     if (n == 0) {
870         L_ERROR("no pdf files made\n", procName);
871         ptraDestroy(&pa_data, FALSE, FALSE);
872         return 1;
873     }
874 
875         /* Concatenate them */
876     ret = ptraConcatenatePdfToData(pa_data, NULL, pdata, pnbytes);
877 
878     ptraGetActualCount(pa_data, &n);  /* recalculate in case it changes */
879     for (i = 0; i < n; i++) {
880         ba = (L_BYTEA *)ptraRemove(pa_data, i, L_NO_COMPACTION);
881         l_byteaDestroy(&ba);
882     }
883     ptraDestroy(&pa_data, FALSE, FALSE);
884     return ret;
885 }
886 
887 
888 /*---------------------------------------------------------------------*
889  *                Single page, multi-image converters                  *
890  *---------------------------------------------------------------------*/
891 /*!
892  * \brief   convertToPdf()
893  *
894  * \param[in]      filein input image file -- any format
895  * \param[in]      type L_G4_ENCODE, L_JPEG_ENCODE, L_FLATE_ENCODE
896  * \param[in]      quality used for JPEG only; 0 for default (75)
897  * \param[in]      fileout output pdf file; only required on last image on page
898  * \param[in]      x, y location of lower-left corner of image, in pixels,
899  *                      relative to the PostScript origin (0,0) at
900  *                      the lower-left corner of the page
901  * \param[in]      res override the resolution of the input image, in ppi;
902  *                     use 0 to respect the resolution embedded in the input
903  * \param[in]      title [optional] pdf title; if null, taken from filein
904  * \param[in,out]  plpd ptr to lpd, which is created on the first invocation
905  *                      and returned until last image is processed, at which
906  *                      time it is destroyed
907  * \param[in]      position in image sequence: L_FIRST_IMAGE, L_NEXT_IMAGE,
908  *                          L_LAST_IMAGE
909  * \return  0 if OK, 1 on error
910  *
911  * <pre>
912  * Notes:
913  *      (1) To wrap only one image in pdf, input %plpd = NULL, and
914  *          the value of %position will be ignored:
915  *            convertToPdf(...  type, quality, x, y, res, NULL, 0);
916  *      (2) To wrap multiple images on a single pdf page, this is called
917  *          once for each successive image.  Do it this way:
918  *            L_PDF_DATA   *lpd;
919  *            convertToPdf(...  type, quality, x, y, res, &lpd, L_FIRST_IMAGE);
920  *            convertToPdf(...  type, quality, x, y, res, &lpd, L_NEXT_IMAGE);
921  *            ...
922  *            convertToPdf(...  type, quality, x, y, res, &lpd, L_LAST_IMAGE);
923  *          This will write the result to the value of %fileout specified
924  *          in the first call; succeeding values of %fileout are ignored.
925  *          On the last call: the pdf data bytes are computed and written
926  *          to %fileout, lpd is destroyed internally, and the returned
927  *          value of lpd is null.  So the client has nothing to clean up.
928  *      (3) (a) Set %res == 0 to respect the resolution embedded in the
929  *              image file.  If no resolution is embedded, it will be set
930  *              to the default value.
931  *          (b) Set %res to some other value to override the file resolution.
932  *      (4) (a) If the input %res and the resolution of the output device
933  *              are equal, the image will be "displayed" at the same size
934  *              as the original.
935  *          (b) If the input %res is 72, the output device will render
936  *              the image at 1 pt/pixel.
937  *          (c) Some possible choices for the default input pix resolution are:
938  *                 72 ppi     Render pix on any output device at one pt/pixel
939  *                 96 ppi     Windows default for generated display images
940  *                300 ppi     Typical default for scanned images.
941  *              We choose 300, which is sensible for rendering page images.
942  *              However,  images come from a variety of sources, and
943  *              some are explicitly created for viewing on a display.
944  * </pre>
945  */
946 l_int32
convertToPdf(const char * filein,l_int32 type,l_int32 quality,const char * fileout,l_int32 x,l_int32 y,l_int32 res,const char * title,L_PDF_DATA ** plpd,l_int32 position)947 convertToPdf(const char   *filein,
948              l_int32       type,
949              l_int32       quality,
950              const char   *fileout,
951              l_int32       x,
952              l_int32       y,
953              l_int32       res,
954              const char   *title,
955              L_PDF_DATA  **plpd,
956              l_int32       position)
957 {
958 l_uint8  *data;
959 l_int32   ret;
960 size_t    nbytes;
961 
962     PROCNAME("convertToPdf");
963 
964     if (!filein)
965         return ERROR_INT("filein not defined", procName, 1);
966     if (!plpd || (position == L_LAST_IMAGE)) {
967         if (!fileout)
968             return ERROR_INT("fileout not defined", procName, 1);
969     }
970     if (type != L_G4_ENCODE && type != L_JPEG_ENCODE &&
971         type != L_FLATE_ENCODE)
972         return ERROR_INT("invalid conversion type", procName, 1);
973 
974     if (convertToPdfData(filein, type, quality, &data, &nbytes, x, y,
975                          res, title, plpd, position))
976         return ERROR_INT("pdf data not made", procName, 1);
977 
978     if (!plpd || (position == L_LAST_IMAGE)) {
979         ret = l_binaryWrite(fileout, "w", data, nbytes);
980         LEPT_FREE(data);
981         if (ret)
982             return ERROR_INT("pdf data not written to file", procName, 1);
983     }
984 
985     return 0;
986 }
987 
988 
989 /*!
990  * \brief   convertImageDataToPdf()
991  *
992  * \param[in]      imdata array of formatted image data; e.g., png, jpeg
993  * \param[in]      size size of image data
994  * \param[in]      type L_G4_ENCODE, L_JPEG_ENCODE, L_FLATE_ENCODE
995  * \param[in]      quality used for JPEG only; 0 for default (75)
996  * \param[in]      fileout output pdf file; only required on last image on page
997  * \param[in]      x, y location of lower-left corner of image, in pixels,
998  *                      relative to the PostScript origin (0,0) at
999  *                      the lower-left corner of the page
1000  * \param[in]      res override the resolution of the input image, in ppi;
1001  *                     use 0 to respect the resolution embedded in the input
1002  * \param[in]      title [optional] pdf title
1003  * \param[in,out]  plpd ptr to lpd, which is created on the first invocation
1004  *                      and returned until last image is processed, at which
1005  *                      time it is destroyed
1006  * \param[in]      position in image sequence: L_FIRST_IMAGE, L_NEXT_IMAGE,
1007  *                          L_LAST_IMAGE
1008  * \return  0 if OK, 1 on error
1009  *
1010  * <pre>
1011  * Notes:
1012  *      (1) If %res == 0 and the input resolution field is 0,
1013  *          this will use DEFAULT_INPUT_RES.
1014  *      (2) See comments in convertToPdf().
1015  * </pre>
1016  */
1017 l_int32
convertImageDataToPdf(l_uint8 * imdata,size_t size,l_int32 type,l_int32 quality,const char * fileout,l_int32 x,l_int32 y,l_int32 res,const char * title,L_PDF_DATA ** plpd,l_int32 position)1018 convertImageDataToPdf(l_uint8      *imdata,
1019                       size_t        size,
1020                       l_int32       type,
1021                       l_int32       quality,
1022                       const char   *fileout,
1023                       l_int32       x,
1024                       l_int32       y,
1025                       l_int32       res,
1026                       const char   *title,
1027                       L_PDF_DATA  **plpd,
1028                       l_int32       position)
1029 {
1030 l_int32  ret;
1031 PIX     *pix;
1032 
1033     PROCNAME("convertImageDataToPdf");
1034 
1035     if (!imdata)
1036         return ERROR_INT("image data not defined", procName, 1);
1037     if (type != L_G4_ENCODE && type != L_JPEG_ENCODE &&
1038         type != L_FLATE_ENCODE)
1039         return ERROR_INT("invalid conversion type", procName, 1);
1040     if (!plpd || (position == L_LAST_IMAGE)) {
1041         if (!fileout)
1042             return ERROR_INT("fileout not defined", procName, 1);
1043     }
1044 
1045     if ((pix = pixReadMem(imdata, size)) == NULL)
1046         return ERROR_INT("pix not read", procName, 1);
1047     ret = pixConvertToPdf(pix, type, quality, fileout, x, y, res,
1048                           title, plpd, position);
1049     pixDestroy(&pix);
1050     return ret;
1051 }
1052 
1053 
1054 /*!
1055  * \brief   convertToPdfData()
1056  *
1057  * \param[in]      filein input image file -- any format
1058  * \param[in]      type L_G4_ENCODE, L_JPEG_ENCODE, L_FLATE_ENCODE
1059  * \param[in]      quality used for JPEG only; 0 for default (75)
1060  * \param[out]     pdata pdf data in memory
1061  * \param[out]     pnbytes number of bytes in pdf data
1062  * \param[in]      x, y location of lower-left corner of image, in pixels,
1063  *                      relative to the PostScript origin (0,0) at
1064  *                      the lower-left corner of the page
1065  * \param[in]      res override the resolution of the input image, in ppi;
1066  *                     use 0 to respect the resolution embedded in the input
1067  * \param[in]      title [optional] pdf title; if null, use filein
1068  * \param[in,out]  plpd ptr to lpd, which is created on the first invocation
1069  *                      and returned until last image is processed, at which
1070  *                      time it is destroyed
1071  * \param[in]      position in image sequence: L_FIRST_IMAGE, L_NEXT_IMAGE,
1072  *                          L_LAST_IMAGE
1073  * \return  0 if OK, 1 on error
1074  *
1075  * <pre>
1076  * Notes:
1077  *      (1) If %res == 0 and the input resolution field is 0,
1078  *          this will use DEFAULT_INPUT_RES.
1079  *      (2) See comments in convertToPdf().
1080  * </pre>
1081  */
1082 l_int32
convertToPdfData(const char * filein,l_int32 type,l_int32 quality,l_uint8 ** pdata,size_t * pnbytes,l_int32 x,l_int32 y,l_int32 res,const char * title,L_PDF_DATA ** plpd,l_int32 position)1083 convertToPdfData(const char   *filein,
1084                  l_int32       type,
1085                  l_int32       quality,
1086                  l_uint8     **pdata,
1087                  size_t       *pnbytes,
1088                  l_int32       x,
1089                  l_int32       y,
1090                  l_int32       res,
1091                  const char   *title,
1092                  L_PDF_DATA  **plpd,
1093                  l_int32       position)
1094 {
1095 PIX  *pix;
1096 
1097     PROCNAME("convertToPdfData");
1098 
1099     if (!pdata)
1100         return ERROR_INT("&data not defined", procName, 1);
1101     *pdata = NULL;
1102     if (!pnbytes)
1103         return ERROR_INT("&nbytes not defined", procName, 1);
1104     *pnbytes = 0;
1105     if (!filein)
1106         return ERROR_INT("filein not defined", procName, 1);
1107     if (type != L_G4_ENCODE && type != L_JPEG_ENCODE &&
1108         type != L_FLATE_ENCODE)
1109         return ERROR_INT("invalid conversion type", procName, 1);
1110 
1111     if ((pix = pixRead(filein)) == NULL)
1112         return ERROR_INT("pix not made", procName, 1);
1113 
1114     pixConvertToPdfData(pix, type, quality, pdata, pnbytes,
1115                         x, y, res, (title) ? title : filein, plpd, position);
1116     pixDestroy(&pix);
1117     return 0;
1118 }
1119 
1120 
1121 /*!
1122  * \brief   convertImageDataToPdfData()
1123  *
1124  * \param[in]    imdata array of formatted image data; e.g., png, jpeg
1125  * \param[in]    size size of image data
1126  * \param[in]    type L_G4_ENCODE, L_JPEG_ENCODE, L_FLATE_ENCODE
1127  * \param[in]    quality used for JPEG only; 0 for default (75)
1128  * \param[out]   pdata pdf data in memory
1129  * \param[out]   pnbytes number of bytes in pdf data
1130  * \param[in]    x, y location of lower-left corner of image, in pixels,
1131  *                    relative to the PostScript origin (0,0) at
1132  *                     the lower-left corner of the page
1133  * \param[in]    res override the resolution of the input image, in ppi;
1134  *                   use 0 to respect the resolution embedded in the input
1135  * \param[in]    title [optional] pdf title
1136  * \param[out]   plpd ptr to lpd, which is created on the first invocation
1137  *                    and returned until last image is processed, at which
1138  *                    time it is destroyed
1139  * \param[in]    position in image sequence: L_FIRST_IMAGE, L_NEXT_IMAGE,
1140  *                       L_LAST_IMAGE
1141  * \return  0 if OK, 1 on error
1142  *
1143  * <pre>
1144  * Notes:
1145  *      (1) If %res == 0 and the input resolution field is 0,
1146  *          this will use DEFAULT_INPUT_RES.
1147  *      (2) See comments in convertToPdf().
1148  * </pre>
1149  */
1150 l_int32
convertImageDataToPdfData(l_uint8 * imdata,size_t size,l_int32 type,l_int32 quality,l_uint8 ** pdata,size_t * pnbytes,l_int32 x,l_int32 y,l_int32 res,const char * title,L_PDF_DATA ** plpd,l_int32 position)1151 convertImageDataToPdfData(l_uint8      *imdata,
1152                           size_t        size,
1153                           l_int32       type,
1154                           l_int32       quality,
1155                           l_uint8     **pdata,
1156                           size_t       *pnbytes,
1157                           l_int32       x,
1158                           l_int32       y,
1159                           l_int32       res,
1160                           const char   *title,
1161                           L_PDF_DATA  **plpd,
1162                           l_int32       position)
1163 {
1164 l_int32  ret;
1165 PIX     *pix;
1166 
1167     PROCNAME("convertImageDataToPdfData");
1168 
1169     if (!pdata)
1170         return ERROR_INT("&data not defined", procName, 1);
1171     *pdata = NULL;
1172     if (!pnbytes)
1173         return ERROR_INT("&nbytes not defined", procName, 1);
1174     *pnbytes = 0;
1175     if (!imdata)
1176         return ERROR_INT("image data not defined", procName, 1);
1177     if (plpd) {  /* part of multi-page invocation */
1178         if (position == L_FIRST_IMAGE)
1179             *plpd = NULL;
1180     }
1181 
1182     if ((pix = pixReadMem(imdata, size)) == NULL)
1183         return ERROR_INT("pix not read", procName, 1);
1184     ret = pixConvertToPdfData(pix, type, quality, pdata, pnbytes,
1185                               x, y, res, title, plpd, position);
1186     pixDestroy(&pix);
1187     return ret;
1188 }
1189 
1190 
1191 /*!
1192  * \brief   pixConvertToPdf()
1193  *
1194  * \param[in]      pix
1195  * \param[in]      type L_G4_ENCODE, L_JPEG_ENCODE, L_FLATE_ENCODE
1196  * \param[in]      quality used for JPEG only; 0 for default (75)
1197  * \param[in]      fileout output pdf file; only required on last image on page
1198  * \param[in]      x, y location of lower-left corner of image, in pixels,
1199  *                      relative to the PostScript origin (0,0 at
1200  *                      the lower-left corner of the page)
1201  * \param[in]      res override the resolution of the input image, in ppi;
1202  *                     use 0 to respect the resolution embedded in the input
1203  * \param[in]      title [optional] pdf title
1204  * \param[in,out]  plpd ptr to lpd, which is created on the first invocation
1205  *                      and returned until last image is processed
1206  * \param[in]      position in image sequence: L_FIRST_IMAGE, L_NEXT_IMAGE,
1207  *                          L_LAST_IMAGE
1208  * \return  0 if OK, 1 on error
1209  *
1210  * <pre>
1211  * Notes:
1212  *      (1) If %res == 0 and the input resolution field is 0,
1213  *          this will use DEFAULT_INPUT_RES.
1214  *      (2) This only writes data to fileout if it is the last
1215  *          image to be written on the page.
1216  *      (3) See comments in convertToPdf().
1217  * </pre>
1218  */
1219 l_int32
pixConvertToPdf(PIX * pix,l_int32 type,l_int32 quality,const char * fileout,l_int32 x,l_int32 y,l_int32 res,const char * title,L_PDF_DATA ** plpd,l_int32 position)1220 pixConvertToPdf(PIX          *pix,
1221                 l_int32       type,
1222                 l_int32       quality,
1223                 const char   *fileout,
1224                 l_int32       x,
1225                 l_int32       y,
1226                 l_int32       res,
1227                 const char   *title,
1228                 L_PDF_DATA  **plpd,
1229                 l_int32       position)
1230 {
1231 l_uint8  *data;
1232 l_int32   ret;
1233 size_t    nbytes;
1234 
1235     PROCNAME("pixConvertToPdf");
1236 
1237     if (!pix)
1238         return ERROR_INT("pix not defined", procName, 1);
1239     if (type != L_G4_ENCODE && type != L_JPEG_ENCODE &&
1240         type != L_FLATE_ENCODE)
1241         return ERROR_INT("invalid conversion type", procName, 1);
1242     if (!plpd || (position == L_LAST_IMAGE)) {
1243         if (!fileout)
1244             return ERROR_INT("fileout not defined", procName, 1);
1245     }
1246 
1247     if (pixConvertToPdfData(pix, type, quality, &data, &nbytes,
1248                             x, y, res, title, plpd, position)) {
1249         LEPT_FREE(data);
1250         return ERROR_INT("pdf data not made", procName, 1);
1251     }
1252 
1253     if (!plpd || (position == L_LAST_IMAGE)) {
1254         ret = l_binaryWrite(fileout, "w", data, nbytes);
1255         LEPT_FREE(data);
1256         if (ret)
1257             return ERROR_INT("pdf data not written to file", procName, 1);
1258     }
1259     return 0;
1260 }
1261 
1262 
1263 /*!
1264  * \brief   pixWriteStreamPdf()
1265  *
1266  * \param[in]    fp file stream opened for writing
1267  * \param[in]    pix all depths, cmap OK
1268  * \param[in]    res override the resolution of the input image, in ppi;
1269  *                   use 0 to respect the resolution embedded in the input
1270  * \param[in]    title [optional] pdf title; taken from the first image
1271  *                     placed on a page; e.g., an input image filename
1272  * \return  0 if OK, 1 on error
1273  *
1274  * <pre>
1275  * Notes:
1276  *      (1) This is the simplest interface for writing a single image
1277  *          with pdf encoding to a stream.  It uses G4 encoding for 1 bpp,
1278  *          JPEG encoding for 8 bpp (no cmap) and 32 bpp, and FLATE
1279  *          encoding for everything else.
1280  * </pre>
1281  */
1282 l_int32
pixWriteStreamPdf(FILE * fp,PIX * pix,l_int32 res,const char * title)1283 pixWriteStreamPdf(FILE        *fp,
1284                   PIX         *pix,
1285                   l_int32      res,
1286                   const char  *title)
1287 {
1288 l_uint8  *data;
1289 size_t    nbytes, nbytes_written;
1290 
1291     PROCNAME("pixWriteStreamPdf");
1292 
1293     if (!fp)
1294         return ERROR_INT("stream not opened", procName, 1);
1295     if (!pix)
1296         return ERROR_INT("pix not defined", procName, 1);
1297 
1298     if (pixWriteMemPdf(&data, &nbytes, pix, res, title) != 0) {
1299         LEPT_FREE(data);
1300         return ERROR_INT("pdf data not made", procName, 1);
1301     }
1302 
1303     nbytes_written = fwrite(data, 1, nbytes, fp);
1304     LEPT_FREE(data);
1305     if (nbytes != nbytes_written)
1306         return ERROR_INT("failure writing pdf data to stream", procName, 1);
1307     return 0;
1308 }
1309 
1310 
1311 /*!
1312  * \brief   pixWriteMemPdf()
1313  *
1314  * \param[out]   pdata pdf as byte array
1315  * \param[out]   pnbytes number of bytes in pdf array
1316  * \param[in]    pix all depths, cmap OK
1317  * \param[in]    res override the resolution of the input image, in ppi;
1318  *                   use 0 to respect the resolution embedded in the input
1319  * \param[in]    title [optional] pdf title; taken from the first image
1320  *                     placed on a page; e.g., an input image filename
1321  * \return  0 if OK, 1 on error
1322  *
1323  * <pre>
1324  * Notes:
1325  *      (1) This is the simplest interface for writing a single image
1326  *          with pdf encoding to memory.  It uses G4 encoding for 1 bpp,
1327  *          JPEG encoding for 8 bpp (no cmap) and 32 bpp, and FLATE
1328  *          encoding for everything else.
1329  * </pre>
1330  */
1331 l_int32
pixWriteMemPdf(l_uint8 ** pdata,size_t * pnbytes,PIX * pix,l_int32 res,const char * title)1332 pixWriteMemPdf(l_uint8    **pdata,
1333                size_t      *pnbytes,
1334                PIX         *pix,
1335                l_int32      res,
1336                const char  *title)
1337 {
1338 l_int32   ret, d, type;
1339 PIXCMAP  *cmap;
1340 
1341     PROCNAME("pixWriteMemPdf");
1342 
1343     if (pdata) *pdata = NULL;
1344     if (pnbytes) *pnbytes = 0;
1345     if (!pdata || !pnbytes)
1346         return ERROR_INT("&data or &nbytes not defined", procName, 1);
1347     if (!pix)
1348         return ERROR_INT("pix not defined", procName, 1);
1349 
1350     d = pixGetDepth(pix);
1351     cmap = pixGetColormap(pix);
1352     if (d == 1)
1353         type = L_G4_ENCODE;
1354     else if (cmap || d == 2 || d == 4 || d == 16)
1355         type = L_FLATE_ENCODE;
1356     else  /* d == 8 (no cmap) or d == 32 */
1357         type = L_JPEG_ENCODE;
1358 
1359     ret = pixConvertToPdfData(pix, type, 75, pdata, pnbytes,
1360                               0, 0, res, title, NULL, 0);
1361     if (ret)
1362         return ERROR_INT("pdf data not made", procName, 1);
1363     return 0;
1364 }
1365 
1366 
1367 /*---------------------------------------------------------------------*
1368  *            Segmented multi-page, multi-image converter              *
1369  *---------------------------------------------------------------------*/
1370 /*!
1371  * \brief   convertSegmentedFilesToPdf()
1372  *
1373  * \param[in]    dirname directory name containing images
1374  * \param[in]    substr [optional] substring filter on filenames; can be NULL
1375  * \param[in]    res input resolution of all images
1376  * \param[in]    type compression type for non-image regions; the
1377  *                    image regions are always compressed with L_JPEG_ENCODE
1378  * \param[in]    thresh used for converting gray --> 1 bpp with L_G4_ENCODE
1379  * \param[in]    baa [optional] boxaa of image regions
1380  * \param[in]    quality used for JPEG only; 0 for default (75)
1381  * \param[in]    scalefactor scaling factor applied to each image region
1382  * \param[in]    title [optional] pdf title; if null, taken from the first
1383  *                     image filename
1384  * \param[in]    fileout pdf file of all images
1385  * \return  0 if OK, 1 on error
1386  *
1387  * <pre>
1388  * Notes:
1389  *      (1) If %substr is not NULL, only image filenames that contain
1390  *          the substring can be used.  If %substr == NULL, all files
1391  *          in the directory are used.
1392  *      (2) The files in the directory, after optional filtering by
1393  *          the substring, are lexically sorted in increasing order
1394  *          before concatenation.
1395  *      (3) The images are encoded with G4 if 1 bpp; JPEG if 8 bpp without
1396  *          colormap and many colors, or 32 bpp; FLATE for anything else.
1397  *      (4) The boxaa, if it exists, contains one boxa of "image regions"
1398  *          for each image file.  The boxa must be aligned with the
1399  *          sorted set of images.
1400  *      (5) The scalefactor is applied to each image region.  It is
1401  *          typically < 1.0, to save bytes in the final pdf, because
1402  *          the resolution is often not critical in non-text regions.
1403  *      (6) If the non-image regions have pixel depth > 1 and the encoding
1404  *          type is G4, they are automatically scaled up by 2x and
1405  *          thresholded.  Otherwise, no scaling is performed on them.
1406  *      (7) Note that this function can be used to generate multipage
1407  *          G4 compressed pdf from any input, by using %boxaa == NULL
1408  *          and %type == L_G4_ENCODE.
1409  * </pre>
1410  */
1411 l_int32
convertSegmentedFilesToPdf(const char * dirname,const char * substr,l_int32 res,l_int32 type,l_int32 thresh,BOXAA * baa,l_int32 quality,l_float32 scalefactor,const char * title,const char * fileout)1412 convertSegmentedFilesToPdf(const char  *dirname,
1413                            const char  *substr,
1414                            l_int32      res,
1415                            l_int32      type,
1416                            l_int32      thresh,
1417                            BOXAA       *baa,
1418                            l_int32      quality,
1419                            l_float32    scalefactor,
1420                            const char  *title,
1421                            const char  *fileout)
1422 {
1423 char     *fname;
1424 l_uint8  *imdata, *data;
1425 l_int32   i, npages, nboxa, nboxes, ret;
1426 size_t    imbytes, databytes;
1427 BOXA     *boxa;
1428 L_BYTEA  *ba;
1429 L_PTRA   *pa_data;
1430 SARRAY   *sa;
1431 
1432     PROCNAME("convertSegmentedFilesToPdf");
1433 
1434     if (!dirname)
1435         return ERROR_INT("dirname not defined", procName, 1);
1436     if (!fileout)
1437         return ERROR_INT("fileout not defined", procName, 1);
1438 
1439     if ((sa = getNumberedPathnamesInDirectory(dirname, substr, 0, 0, 10000))
1440             == NULL)
1441         return ERROR_INT("sa not made", procName, 1);
1442 
1443     npages = sarrayGetCount(sa);
1444         /* If necessary, extend the boxaa, which is page-aligned with
1445          * the image files, to be as large as the set of images. */
1446     if (baa) {
1447         nboxa = boxaaGetCount(baa);
1448         if (nboxa < npages) {
1449             boxa = boxaCreate(1);
1450             boxaaExtendWithInit(baa, npages, boxa);
1451             boxaDestroy(&boxa);
1452         }
1453     }
1454 
1455         /* Generate and save all the encoded pdf strings */
1456     pa_data = ptraCreate(npages);
1457     for (i = 0; i < npages; i++) {
1458         fname = sarrayGetString(sa, i, L_NOCOPY);
1459         if (!strcmp(fname, "")) continue;
1460         boxa = NULL;
1461         if (baa) {
1462             boxa = boxaaGetBoxa(baa, i, L_CLONE);
1463             nboxes = boxaGetCount(boxa);
1464             if (nboxes == 0)
1465                 boxaDestroy(&boxa);
1466         }
1467         ret = convertToPdfDataSegmented(fname, res, type, thresh, boxa,
1468                                         quality, scalefactor, title,
1469                                         &imdata, &imbytes);
1470         boxaDestroy(&boxa);  /* safe; in case nboxes > 0 */
1471         if (ret) {
1472             L_ERROR("pdf encoding failed for %s\n", procName, fname);
1473             continue;
1474         }
1475         ba = l_byteaInitFromMem(imdata, imbytes);
1476         if (imdata) LEPT_FREE(imdata);
1477         ptraAdd(pa_data, ba);
1478     }
1479     sarrayDestroy(&sa);
1480 
1481     ptraGetActualCount(pa_data, &npages);
1482     if (npages == 0) {
1483         L_ERROR("no pdf files made\n", procName);
1484         ptraDestroy(&pa_data, FALSE, FALSE);
1485         return 1;
1486     }
1487 
1488         /* Concatenate */
1489     ret = ptraConcatenatePdfToData(pa_data, NULL, &data, &databytes);
1490 
1491         /* Clean up */
1492     ptraGetActualCount(pa_data, &npages);  /* recalculate in case it changes */
1493     for (i = 0; i < npages; i++) {
1494         ba = (L_BYTEA *)ptraRemove(pa_data, i, L_NO_COMPACTION);
1495         l_byteaDestroy(&ba);
1496     }
1497     ptraDestroy(&pa_data, FALSE, FALSE);
1498 
1499     if (ret) {
1500         if (data) LEPT_FREE(data);
1501         return ERROR_INT("pdf data not made", procName, 1);
1502     }
1503 
1504     ret = l_binaryWrite(fileout, "w", data, databytes);
1505     LEPT_FREE(data);
1506     if (ret)
1507         L_ERROR("pdf data not written to file\n", procName);
1508     return ret;
1509 }
1510 
1511 
1512 /*!
1513  * \brief   convertNumberedMasksToBoxaa()
1514  *
1515  * \param[in]    dirname directory name containing mask images
1516  * \param[in]    substr [optional] substring filter on filenames; can be NULL
1517  * \param[in]    numpre number of characters in name before number
 * \param[in]    numpost number of characters in name after number, up
 *                       to a dot before an extension (the extension
 *                       and its dot separator are not counted)
1521  * \return  boxaa of mask regions, or NULL on error
1522  *
1523  * <pre>
1524  * Notes:
1525  *      (1) This is conveniently used to generate the input boxaa
1526  *          for convertSegmentedFilesToPdf().  It guarantees that the
1527  *          boxa will be aligned with the page images, even if some
1528  *          of the boxa are empty.
1529  * </pre>
1530  */
1531 BOXAA *
convertNumberedMasksToBoxaa(const char * dirname,const char * substr,l_int32 numpre,l_int32 numpost)1532 convertNumberedMasksToBoxaa(const char  *dirname,
1533                             const char  *substr,
1534                             l_int32      numpre,
1535                             l_int32      numpost)
1536 {
1537 char    *fname;
1538 l_int32  i, n;
1539 BOXA    *boxa;
1540 BOXAA   *baa;
1541 PIX     *pix;
1542 SARRAY  *sa;
1543 
1544     PROCNAME("convertNumberedMasksToBoxaa");
1545 
1546     if (!dirname)
1547         return (BOXAA *)ERROR_PTR("dirname not defined", procName, NULL);
1548 
1549     if ((sa = getNumberedPathnamesInDirectory(dirname, substr, numpre,
1550                                               numpost, 10000)) == NULL)
1551         return (BOXAA *)ERROR_PTR("sa not made", procName, NULL);
1552 
1553         /* Generate and save all the encoded pdf strings */
1554     n = sarrayGetCount(sa);
1555     baa = boxaaCreate(n);
1556     boxa = boxaCreate(1);
1557     boxaaInitFull(baa, boxa);
1558     boxaDestroy(&boxa);
1559     for (i = 0; i < n; i++) {
1560         fname = sarrayGetString(sa, i, L_NOCOPY);
1561         if (!strcmp(fname, "")) continue;
1562         if ((pix = pixRead(fname)) == NULL) {
1563             L_WARNING("invalid image on page %d\n", procName, i);
1564             continue;
1565         }
1566         boxa = pixConnComp(pix, NULL, 8);
1567         boxaaReplaceBoxa(baa, i, boxa);
1568         pixDestroy(&pix);
1569     }
1570 
1571     sarrayDestroy(&sa);
1572     return baa;
1573 }
1574 
1575 
1576 /*---------------------------------------------------------------------*
1577  *            Segmented single page, multi-image converters            *
1578  *---------------------------------------------------------------------*/
1579 /*!
1580  * \brief   convertToPdfSegmented()
1581  *
1582  * \param[in]    filein input image file -- any format
1583  * \param[in]    res input image resolution; typ. 300 ppi; use 0 for default
1584  * \param[in]    type compression type for non-image regions; the
1585  *                    image regions are always compressed with L_JPEG_ENCODE
1586  * \param[in]    thresh used for converting gray --> 1 bpp with L_G4_ENCODE
1587  * \param[in]    boxa [optional] of image regions; can be null
1588  * \param[in]    quality used for jpeg image regions; 0 for default
1589  * \param[in]    scalefactor used for jpeg regions; must be <= 1.0
1590  * \param[in]    title [optional] pdf title; typically taken from the
1591  *                     input file for the pix
1592  * \param[in]    fileout output pdf file
1593  * \return  0 if OK, 1 on error
1594  *
1595  * <pre>
1596  * Notes:
1597  *      (1) If there are no image regions, set %boxa == NULL;
1598  *          %quality and %scalefactor are ignored.
1599  *      (2) Typically, %scalefactor is < 1.0, because the image regions
1600  *          can be rendered at a lower resolution (for better compression)
1601  *          than the text regions.  If %scalefactor == 0, we use 1.0.
1602  *          If the input image is 1 bpp and scalefactor < 1.0, we
1603  *          use scaleToGray() to downsample the image regions to gray
1604  *          before compressing them.
1605  *      (3) If the compression type for non-image regions is L_G4_ENCODE
1606  *          and bpp > 1, the image is upscaled 2x and thresholded
1607  *          to 1 bpp.  That is the only situation where %thresh is used.
1608  *      (4) The parameter %quality is only used for image regions.
1609  *          If %type == L_JPEG_ENCODE, default jpeg quality (75) is
1610  *          used for the non-image regions.
1611  *      (5) Processing matrix for non-image regions.
1612  *
1613  *          Input           G4              JPEG                FLATE
1614  *          ----------|---------------------------------------------------
1615  *          1 bpp     |  1x, 1 bpp       1x flate, 1 bpp     1x, 1 bpp
1616  *                    |
1617  *          cmap      |  2x, 1 bpp       1x flate, cmap      1x, cmap
1618  *                    |
1619  *          2,4 bpp   |  2x, 1 bpp       1x flate            1x, 2,4 bpp
1620  *          no cmap   |                  2,4 bpp
1621  *                    |
1622  *          8,32 bpp  |  2x, 1 bpp       1x (jpeg)           1x, 8,32 bpp
1623  *          no cmap   |                  8,32 bpp
1624  *
1625  *          Summary:
1626  *          (a) if G4 is requested, G4 is used, with 2x upscaling
1627  *              for all cases except 1 bpp.
1628  *          (b) if JPEG is requested, use flate encoding for all cases
1629  *              except 8 bpp without cmap and 32 bpp (rgb).
1630  *          (c) if FLATE is requested, use flate with no transformation
1631  *              of the raster data.
1632  *      (6) Calling options/sequence for these functions:
1633  *              file  -->  file      (convertToPdfSegmented)
1634  *                  pix  -->  file      (pixConvertToPdfSegmented)
1635  *                      pix  -->  data      (pixConvertToPdfDataSegmented)
1636  *              file  -->  data      (convertToPdfDataSegmented)
1637  *                      pix  -->  data      (pixConvertToPdfDataSegmented)
1638  * </pre>
1639  */
1640 l_int32
convertToPdfSegmented(const char * filein,l_int32 res,l_int32 type,l_int32 thresh,BOXA * boxa,l_int32 quality,l_float32 scalefactor,const char * title,const char * fileout)1641 convertToPdfSegmented(const char  *filein,
1642                       l_int32      res,
1643                       l_int32      type,
1644                       l_int32      thresh,
1645                       BOXA        *boxa,
1646                       l_int32      quality,
1647                       l_float32    scalefactor,
1648                       const char  *title,
1649                       const char  *fileout)
1650 {
1651 l_int32  ret;
1652 PIX     *pixs;
1653 
1654     PROCNAME("convertToPdfSegmented");
1655 
1656     if (!filein)
1657         return ERROR_INT("filein not defined", procName, 1);
1658     if (!fileout)
1659         return ERROR_INT("fileout not defined", procName, 1);
1660     if (type != L_G4_ENCODE && type != L_JPEG_ENCODE &&
1661         type != L_FLATE_ENCODE)
1662         return ERROR_INT("invalid conversion type", procName, 1);
1663     if (boxa && scalefactor > 1.0) {
1664         L_WARNING("setting scalefactor to 1.0\n", procName);
1665         scalefactor = 1.0;
1666     }
1667 
1668     if ((pixs = pixRead(filein)) == NULL)
1669         return ERROR_INT("pixs not made", procName, 1);
1670 
1671     ret = pixConvertToPdfSegmented(pixs, res, type, thresh, boxa, quality,
1672                                    scalefactor, (title) ? title : filein,
1673                                    fileout);
1674     pixDestroy(&pixs);
1675     return ret;
1676 }
1677 
1678 
1679 /*!
1680  * \brief   pixConvertToPdfSegmented()
1681  *
1682  * \param[in]    pixs any depth, cmap OK
1683  * \param[in]    res input image resolution; typ. 300 ppi; use 0 for default
1684  * \param[in]    type compression type for non-image regions; the
1685  *                    image regions are always compressed with L_JPEG_ENCODE
1686  * \param[in]    thresh used for converting gray --> 1 bpp with L_G4_ENCODE
1687  * \param[in]    boxa [optional] of image regions; can be null
1688  * \param[in]    quality used for jpeg image regions; 0 for default
1689  * \param[in]    scalefactor used for jpeg regions; must be <= 1.0
1690  * \param[in]    title [optional] pdf title; typically taken from the
1691  *                     input file for the pix
1692  * \param[in]    fileout output pdf file
1693  * \return  0 if OK, 1 on error
1694  *
1695  * <pre>
1696  * Notes:
1697  *      (1) See convertToPdfSegmented() for details.
1698  * </pre>
1699  */
1700 l_int32
pixConvertToPdfSegmented(PIX * pixs,l_int32 res,l_int32 type,l_int32 thresh,BOXA * boxa,l_int32 quality,l_float32 scalefactor,const char * title,const char * fileout)1701 pixConvertToPdfSegmented(PIX         *pixs,
1702                          l_int32      res,
1703                          l_int32      type,
1704                          l_int32      thresh,
1705                          BOXA        *boxa,
1706                          l_int32      quality,
1707                          l_float32    scalefactor,
1708                          const char  *title,
1709                          const char  *fileout)
1710 {
1711 l_uint8  *data;
1712 l_int32   ret;
1713 size_t    nbytes;
1714 
1715     PROCNAME("pixConvertToPdfSegmented");
1716 
1717     if (!pixs)
1718         return ERROR_INT("pixs not defined", procName, 1);
1719     if (!fileout)
1720         return ERROR_INT("fileout not defined", procName, 1);
1721     if (type != L_G4_ENCODE && type != L_JPEG_ENCODE &&
1722         type != L_FLATE_ENCODE)
1723         return ERROR_INT("invalid conversion type", procName, 1);
1724     if (boxa && scalefactor > 1.0) {
1725         L_WARNING("setting scalefactor to 1.0\n", procName);
1726         scalefactor = 1.0;
1727     }
1728 
1729     ret = pixConvertToPdfDataSegmented(pixs, res, type, thresh, boxa, quality,
1730                                        scalefactor, title, &data, &nbytes);
1731     if (ret)
1732         return ERROR_INT("pdf generation failure", procName, 1);
1733 
1734     ret = l_binaryWrite(fileout, "w", data, nbytes);
1735     if (data) LEPT_FREE(data);
1736     return ret;
1737 }
1738 
1739 
1740 /*!
1741  * \brief   convertToPdfDataSegmented()
1742  *
1743  * \param[in]    filein input image file -- any format
1744  * \param[in]    res input image resolution; typ. 300 ppi; use 0 for default
1745  * \param[in]    type compression type for non-image regions; the
1746  *                    image regions are always compressed with L_JPEG_ENCODE
1747  * \param[in]    thresh used for converting gray --> 1 bpp with L_G4_ENCODE
1748  * \param[in]    boxa [optional] image regions; can be null
1749  * \param[in]    quality used for jpeg image regions; 0 for default
1750  * \param[in]    scalefactor used for jpeg regions; must be <= 1.0
1751  * \param[in]    title [optional] pdf title; if null, uses filein
1752  * \param[out]   pdata pdf data in memory
1753  * \param[out]   pnbytes number of bytes in pdf data
1754  * \return  0 if OK, 1 on error
1755  *
1756  * <pre>
1757  * Notes:
1758  *      (1) If there are no image regions, set %boxa == NULL;
1759  *          %quality and %scalefactor are ignored.
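 *      (2) Typically, %scalefactor is < 1.0, because the image regions
 *          can be rendered at a lower resolution (for better compression)
 *          than the text regions.  See convertToPdfSegmented() for details.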
1761  * </pre>
1762  */
1763 l_int32
convertToPdfDataSegmented(const char * filein,l_int32 res,l_int32 type,l_int32 thresh,BOXA * boxa,l_int32 quality,l_float32 scalefactor,const char * title,l_uint8 ** pdata,size_t * pnbytes)1764 convertToPdfDataSegmented(const char  *filein,
1765                           l_int32      res,
1766                           l_int32      type,
1767                           l_int32      thresh,
1768                           BOXA        *boxa,
1769                           l_int32      quality,
1770                           l_float32    scalefactor,
1771                           const char  *title,
1772                           l_uint8    **pdata,
1773                           size_t      *pnbytes)
1774 {
1775 l_int32  ret;
1776 PIX     *pixs;
1777 
1778     PROCNAME("convertToPdfDataSegmented");
1779 
1780     if (!pdata)
1781         return ERROR_INT("&data not defined", procName, 1);
1782     *pdata = NULL;
1783     if (!pnbytes)
1784         return ERROR_INT("&nbytes not defined", procName, 1);
1785     *pnbytes = 0;
1786     if (!filein)
1787         return ERROR_INT("filein not defined", procName, 1);
1788     if (type != L_G4_ENCODE && type != L_JPEG_ENCODE &&
1789         type != L_FLATE_ENCODE)
1790         return ERROR_INT("invalid conversion type", procName, 1);
1791     if (boxa && scalefactor > 1.0) {
1792         L_WARNING("setting scalefactor to 1.0\n", procName);
1793         scalefactor = 1.0;
1794     }
1795 
1796     if ((pixs = pixRead(filein)) == NULL)
1797         return ERROR_INT("pixs not made", procName, 1);
1798 
1799     ret = pixConvertToPdfDataSegmented(pixs, res, type, thresh, boxa,
1800                                        quality, scalefactor,
1801                                        (title) ? title : filein,
1802                                        pdata, pnbytes);
1803     pixDestroy(&pixs);
1804     return ret;
1805 }
1806 
1807 
1808 /*!
1809  * \brief   pixConvertToPdfDataSegmented()
1810  *
1811  * \param[in]    pixs any depth, cmap OK
1812  * \param[in]    res input image resolution; typ. 300 ppi; use 0 for default
1813  * \param[in]    type compression type for non-image regions; the
1814  *                    image regions are always compressed with L_JPEG_ENCODE
1815  * \param[in]    thresh used for converting gray --> 1 bpp with L_G4_ENCODE
1816  * \param[in]    boxa [optional] of image regions; can be null
1817  * \param[in]    quality used for jpeg image regions; 0 for default
1818  * \param[in]    scalefactor used for jpeg regions; must be <= 1.0
1819  * \param[in]    title [optional] pdf title; typically taken from the
1820  *                     input file for the pix
1821  * \param[out]   pdata pdf data in memory
1822  * \param[out]   pnbytes number of bytes in pdf data
1823  * \return  0 if OK, 1 on error
1824  *
1825  * <pre>
1826  * Notes:
1827  *      (1) See convertToPdfSegmented() for details.
1828  * </pre>
1829  */
1830 l_int32
pixConvertToPdfDataSegmented(PIX * pixs,l_int32 res,l_int32 type,l_int32 thresh,BOXA * boxa,l_int32 quality,l_float32 scalefactor,const char * title,l_uint8 ** pdata,size_t * pnbytes)1831 pixConvertToPdfDataSegmented(PIX         *pixs,
1832                              l_int32      res,
1833                              l_int32      type,
1834                              l_int32      thresh,
1835                              BOXA        *boxa,
1836                              l_int32      quality,
1837                              l_float32    scalefactor,
1838                              const char  *title,
1839                              l_uint8    **pdata,
1840                              size_t      *pnbytes)
1841 {
1842 l_int32      i, nbox, seq, bx, by, bw, bh, upscale;
1843 l_float32    scale;
1844 BOX         *box, *boxc, *box2;
1845 PIX         *pix, *pixt1, *pixt2, *pixt3, *pixt4, *pixt5, *pixt6;
1846 PIXCMAP     *cmap;
1847 L_PDF_DATA  *lpd;
1848 
1849     PROCNAME("pixConvertToPdfDataSegmented");
1850 
1851     if (!pdata)
1852         return ERROR_INT("&data not defined", procName, 1);
1853     *pdata = NULL;
1854     if (!pnbytes)
1855         return ERROR_INT("&nbytes not defined", procName, 1);
1856     *pnbytes = 0;
1857     if (!pixs)
1858         return ERROR_INT("pixs not defined", procName, 1);
1859     if (type != L_G4_ENCODE && type != L_JPEG_ENCODE &&
1860         type != L_FLATE_ENCODE)
1861         return ERROR_INT("invalid conversion type", procName, 1);
1862     if (boxa && (scalefactor <= 0.0 || scalefactor > 1.0)) {
1863         L_WARNING("setting scalefactor to 1.0\n", procName);
1864         scalefactor = 1.0;
1865     }
1866 
1867         /* Adjust scalefactor so that the product with res gives an integer */
1868     if (res <= 0)
1869         res = DEFAULT_INPUT_RES;
1870     scale = (l_float32)((l_int32)(scalefactor * res + 0.5)) / (l_float32)res;
1871     cmap = pixGetColormap(pixs);
1872 
1873         /* Simple case: single image to be encoded */
1874     if (!boxa || boxaGetCount(boxa) == 0) {
1875         if (pixGetDepth(pixs) > 1 && type == L_G4_ENCODE) {
1876             if (cmap)
1877                 pixt1 = pixRemoveColormap(pixs, REMOVE_CMAP_TO_GRAYSCALE);
1878             else
1879                 pixt1 = pixConvertTo8(pixs, FALSE);
1880             pixt2 = pixScaleGray2xLIThresh(pixt1, thresh);
1881             pixConvertToPdfData(pixt2, type, quality, pdata, pnbytes,
1882                                 0, 0, 2 * res, title, NULL, 0);
1883             pixDestroy(&pixt1);
1884             pixDestroy(&pixt2);
1885         } else {
1886             pixConvertToPdfData(pixs, type, quality, pdata, pnbytes,
1887                                 0, 0, res, title, NULL, 0);
1888         }
1889         return 0;
1890     }
1891 
1892         /* Multiple images to be encoded.  If %type == L_G4_ENCODE,
1893          * jpeg encode a version of pixs that is blanked in the non-image
1894          * regions, and paint the scaled non-image part onto it through a mask.
1895          * Otherwise, we must put the non-image part down first and
1896          * then render all the image regions separately on top of it,
1897          * at their own resolution. */
1898     pixt1 = pixSetBlackOrWhiteBoxa(pixs, boxa, L_SET_WHITE);  /* non-image */
1899     nbox = boxaGetCount(boxa);
1900     if (type == L_G4_ENCODE) {
1901         pixt2 = pixCreateTemplate(pixs);  /* only image regions */
1902         pixSetBlackOrWhite(pixt2, L_SET_WHITE);
1903         for (i = 0; i < nbox; i++) {
1904              box = boxaGetBox(boxa, i, L_CLONE);
1905              pix = pixClipRectangle(pixs, box, &boxc);
1906              boxGetGeometry(boxc, &bx, &by, &bw, &bh);
1907              pixRasterop(pixt2, bx, by, bw, bh, PIX_SRC, pix, 0, 0);
1908              pixDestroy(&pix);
1909              boxDestroy(&box);
1910              boxDestroy(&boxc);
1911         }
1912         pixt3 = pixRemoveColormap(pixt2, REMOVE_CMAP_BASED_ON_SRC);
1913         if (pixGetDepth(pixt3) == 1)
1914             pixt4 = pixScaleToGray(pixt3, scale);
1915         else
1916             pixt4 = pixScale(pixt3, scale, scale);
1917         pixConvertToPdfData(pixt4, L_JPEG_ENCODE, quality, pdata, pnbytes,
1918                             0, 0, (l_int32)(scale * res), title,
1919                             &lpd, L_FIRST_IMAGE);
1920 
1921         if (pixGetDepth(pixt1) == 1) {
1922             pixt5 = pixClone(pixt1);
1923             upscale = 1;
1924         } else {
1925             pixt6 = pixConvertTo8(pixt1, 0);
1926             pixt5 = pixScaleGray2xLIThresh(pixt6, thresh);
1927             pixDestroy(&pixt6);
1928             upscale = 2;
1929         }
1930         pixConvertToPdfData(pixt5, L_G4_ENCODE, quality, pdata, pnbytes,
1931                             0, 0, upscale * res, title, &lpd, L_LAST_IMAGE);
1932         pixDestroy(&pixt2);
1933         pixDestroy(&pixt3);
1934         pixDestroy(&pixt4);
1935         pixDestroy(&pixt5);
1936     } else {
1937             /* Put the non-image part down first.  This is the full
1938                size of the page, so we can use it to find the page
1939                height in pixels, which is required for determining
1940                the LL corner of the image relative to the LL corner
1941                of the page. */
1942         pixConvertToPdfData(pixt1, type, quality, pdata, pnbytes, 0, 0,
1943                             res, title, &lpd, L_FIRST_IMAGE);
1944         for (i = 0; i < nbox; i++) {
1945             box = boxaGetBox(boxa, i, L_CLONE);
1946             pixt2 = pixClipRectangle(pixs, box, &boxc);
1947             pixt3 = pixRemoveColormap(pixt2, REMOVE_CMAP_BASED_ON_SRC);
1948             if (pixGetDepth(pixt3) == 1)
1949                 pixt4 = pixScaleToGray(pixt3, scale);
1950             else
1951                 pixt4 = pixScale(pixt3, scale, scale);
1952             box2 = boxTransform(boxc, 0, 0, scale, scale);
1953             boxGetGeometry(box2, &bx, &by, NULL, &bh);
1954             seq = (i == nbox - 1) ? L_LAST_IMAGE : L_NEXT_IMAGE;
1955             pixConvertToPdfData(pixt4, L_JPEG_ENCODE, quality, pdata, pnbytes,
1956                                 bx, by, (l_int32)(scale * res), title,
1957                                 &lpd, seq);
1958             pixDestroy(&pixt2);
1959             pixDestroy(&pixt3);
1960             pixDestroy(&pixt4);
1961             boxDestroy(&box);
1962             boxDestroy(&boxc);
1963             boxDestroy(&box2);
1964         }
1965     }
1966 
1967     pixDestroy(&pixt1);
1968     return 0;
1969 }
1970 
1971 
1972 /*---------------------------------------------------------------------*
1973  *                         Multi-page concatenation                    *
1974  *---------------------------------------------------------------------*/
1975 /*!
1976  * \brief   concatenatePdf()
1977  *
1978  * \param[in]    dirname directory name containing single-page pdf files
1979  * \param[in]    substr [optional] substring filter on filenames; can be NULL
1980  * \param[in]    fileout concatenated pdf file
1981  * \return  0 if OK, 1 on error
1982  *
1983  * <pre>
1984  * Notes:
1985  *      (1) This only works with leptonica-formatted single-page pdf files.
1986  *      (2) If %substr is not NULL, only filenames that contain
1987  *          the substring can be returned.  If %substr == NULL,
1988  *          none of the filenames are filtered out.
1989  *      (3) The files in the directory, after optional filtering by
1990  *          the substring, are lexically sorted in increasing order
1991  *          before concatenation.
1992  * </pre>
1993  */
1994 l_int32
concatenatePdf(const char * dirname,const char * substr,const char * fileout)1995 concatenatePdf(const char  *dirname,
1996                const char  *substr,
1997                const char  *fileout)
1998 {
1999 l_int32  ret;
2000 SARRAY  *sa;
2001 
2002     PROCNAME("concatenatePdf");
2003 
2004     if (!dirname)
2005         return ERROR_INT("dirname not defined", procName, 1);
2006     if (!fileout)
2007         return ERROR_INT("fileout not defined", procName, 1);
2008 
2009     if ((sa = getSortedPathnamesInDirectory(dirname, substr, 0, 0)) == NULL)
2010         return ERROR_INT("sa not made", procName, 1);
2011     ret = saConcatenatePdf(sa, fileout);
2012     sarrayDestroy(&sa);
2013     return ret;
2014 }
2015 
2016 
2017 /*!
2018  * \brief   saConcatenatePdf()
2019  *
2020  * \param[in]    sa string array of pathnames for single-page pdf files
2021  * \param[in]    fileout concatenated pdf file
2022  * \return  0 if OK, 1 on error
2023  *
2024  * <pre>
2025  * Notes:
2026  *      (1) This only works with leptonica-formatted single-page pdf files.
2027  * </pre>
2028  */
2029 l_int32
saConcatenatePdf(SARRAY * sa,const char * fileout)2030 saConcatenatePdf(SARRAY      *sa,
2031                  const char  *fileout)
2032 {
2033 l_uint8  *data;
2034 l_int32   ret;
2035 size_t    nbytes;
2036 
2037     PROCNAME("saConcatenatePdf");
2038 
2039     if (!sa)
2040         return ERROR_INT("sa not defined", procName, 1);
2041     if (!fileout)
2042         return ERROR_INT("fileout not defined", procName, 1);
2043 
2044     ret = saConcatenatePdfToData(sa, &data, &nbytes);
2045     if (ret)
2046         return ERROR_INT("pdf data not made", procName, 1);
2047     ret = l_binaryWrite(fileout, "w", data, nbytes);
2048     LEPT_FREE(data);
2049     return ret;
2050 }
2051 
2052 
2053 /*!
2054  * \brief   ptraConcatenatePdf()
2055  *
2056  * \param[in]    pa array of pdf strings, each for a single-page pdf file
2057  * \param[in]    fileout concatenated pdf file
2058  * \return  0 if OK, 1 on error
2059  *
2060  * <pre>
2061  * Notes:
2062  *      (1) This only works with leptonica-formatted single-page pdf files.
2063  * </pre>
2064  */
2065 l_int32
ptraConcatenatePdf(L_PTRA * pa,const char * fileout)2066 ptraConcatenatePdf(L_PTRA      *pa,
2067                    const char  *fileout)
2068 {
2069 l_uint8  *data;
2070 l_int32   ret;
2071 size_t    nbytes;
2072 
2073     PROCNAME("ptraConcatenatePdf");
2074 
2075     if (!pa)
2076         return ERROR_INT("pa not defined", procName, 1);
2077     if (!fileout)
2078         return ERROR_INT("fileout not defined", procName, 1);
2079 
2080     ret = ptraConcatenatePdfToData(pa, NULL, &data, &nbytes);
2081     if (ret)
2082         return ERROR_INT("pdf data not made", procName, 1);
2083     ret = l_binaryWrite(fileout, "w", data, nbytes);
2084     LEPT_FREE(data);
2085     return ret;
2086 }
2087 
2088 
2089 /*!
2090  * \brief   concatenatePdfToData()
2091  *
2092  * \param[in]    dirname directory name containing single-page pdf files
2093  * \param[in]    substr [optional] substring filter on filenames; can be NULL
2094  * \param[out]   pdata concatenated pdf data in memory
2095  * \param[out]   pnbytes number of bytes in pdf data
2096  * \return  0 if OK, 1 on error
2097  *
2098  * <pre>
2099  * Notes:
2100  *      (1) This only works with leptonica-formatted single-page pdf files.
2101  *      (2) If %substr is not NULL, only filenames that contain
2102  *          the substring can be returned.  If %substr == NULL,
2103  *          none of the filenames are filtered out.
2104  *      (3) The files in the directory, after optional filtering by
2105  *          the substring, are lexically sorted in increasing order
2106  *          before concatenation.
2107  * </pre>
2108  */
2109 l_int32
concatenatePdfToData(const char * dirname,const char * substr,l_uint8 ** pdata,size_t * pnbytes)2110 concatenatePdfToData(const char  *dirname,
2111                      const char  *substr,
2112                      l_uint8    **pdata,
2113                      size_t      *pnbytes)
2114 {
2115 l_int32  ret;
2116 SARRAY  *sa;
2117 
2118     PROCNAME("concatenatePdfToData");
2119 
2120     if (!pdata)
2121         return ERROR_INT("&data not defined", procName, 1);
2122     *pdata = NULL;
2123     if (!pnbytes)
2124         return ERROR_INT("&nbytes not defined", procName, 1);
2125     *pnbytes = 0;
2126     if (!dirname)
2127         return ERROR_INT("dirname not defined", procName, 1);
2128 
2129     if ((sa = getSortedPathnamesInDirectory(dirname, substr, 0, 0)) == NULL)
2130         return ERROR_INT("sa not made", procName, 1);
2131     ret = saConcatenatePdfToData(sa, pdata, pnbytes);
2132     sarrayDestroy(&sa);
2133     return ret;
2134 }
2135 
2136 
2137 /*!
2138  * \brief   saConcatenatePdfToData()
2139  *
2140  * \param[in]    sa string array of pathnames for single-page pdf files
2141  * \param[out]   pdata concatenated pdf data in memory
2142  * \param[out]   pnbytes number of bytes in pdf data
2143  * \return  0 if OK, 1 on error
2144  *
2145  * <pre>
2146  * Notes:
2147  *      (1) This only works with leptonica-formatted single-page pdf files.
2148  * </pre>
2149  */
2150 l_int32
saConcatenatePdfToData(SARRAY * sa,l_uint8 ** pdata,size_t * pnbytes)2151 saConcatenatePdfToData(SARRAY    *sa,
2152                        l_uint8  **pdata,
2153                        size_t    *pnbytes)
2154 {
2155 char     *fname;
2156 l_int32   i, npages, ret;
2157 L_BYTEA  *bas;
2158 L_PTRA   *pa_data;  /* input pdf data for each page */
2159 
2160     PROCNAME("saConcatenatePdfToData");
2161 
2162     if (!pdata)
2163         return ERROR_INT("&data not defined", procName, 1);
2164     *pdata = NULL;
2165     if (!pnbytes)
2166         return ERROR_INT("&nbytes not defined", procName, 1);
2167     *pnbytes = 0;
2168     if (!sa)
2169         return ERROR_INT("sa not defined", procName, 1);
2170 
2171         /* Read the pdf files into memory */
2172     if ((npages = sarrayGetCount(sa)) == 0)
2173         return ERROR_INT("no filenames found", procName, 1);
2174     pa_data = ptraCreate(npages);
2175     for (i = 0; i < npages; i++) {
2176         fname = sarrayGetString(sa, i, L_NOCOPY);
2177         bas = l_byteaInitFromFile(fname);
2178         ptraAdd(pa_data, bas);
2179     }
2180 
2181     ret = ptraConcatenatePdfToData(pa_data, sa, pdata, pnbytes);
2182 
2183         /* Cleanup: some pages could have been removed */
2184     ptraGetActualCount(pa_data, &npages);
2185     for (i = 0; i < npages; i++) {
2186         bas = (L_BYTEA *)ptraRemove(pa_data, i, L_NO_COMPACTION);
2187         l_byteaDestroy(&bas);
2188     }
2189     ptraDestroy(&pa_data, FALSE, FALSE);
2190     return ret;
2191 }
2192 
2193 /* --------------------------------------------*/
2194 #endif  /* USE_PDFIO */
2195 /* --------------------------------------------*/
2196