1 /*====================================================================*
2  -  Copyright (C) 2001 Leptonica.  All rights reserved.
3  -
4  -  Redistribution and use in source and binary forms, with or without
5  -  modification, are permitted provided that the following conditions
6  -  are met:
7  -  1. Redistributions of source code must retain the above copyright
8  -     notice, this list of conditions and the following disclaimer.
9  -  2. Redistributions in binary form must reproduce the above
10  -     copyright notice, this list of conditions and the following
11  -     disclaimer in the documentation and/or other materials
12  -     provided with the distribution.
13  -
14  -  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
15  -  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
16  -  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
17  -  A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL ANY
18  -  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19  -  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20  -  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21  -  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22  -  OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
23  -  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24  -  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  *====================================================================*/
26 
27 /*!
28  * \file psio1.c
29  * <pre>
30  *
31  *    |=============================================================|
32  *    |                         Important note                      |
33  *    |=============================================================|
34  *    | Some of these functions require libtiff, libjpeg and libz.  |
35  *    | If you do not have these libraries, you must set            |
36  *    |     #define  USE_PSIO     0                                 |
37  *    | in environ.h.  This will link psio1stub.c                   |
38  *    |=============================================================|
39  *
40  *     This is a PostScript "device driver" for wrapping images
41  *     in PostScript.  The images can be rendered by a PostScript
42  *     interpreter for viewing, using evince or gv.  They can also be
43  *     rasterized for printing, using gs or an embedded interpreter
44  *     in a PostScript printer.  And they can be converted to a pdf
45  *     using gs (ps2pdf).
46  *
47  *     Convert specified files to PS
48  *          l_int32          convertFilesToPS()
49  *          l_int32          sarrayConvertFilesToPS()
50  *          l_int32          convertFilesFittedToPS()
51  *          l_int32          sarrayConvertFilesFittedToPS()
52  *          l_int32          writeImageCompressedToPSFile()
53  *
54  *     Convert mixed text/image files to PS
55  *          l_int32          convertSegmentedPagesToPS()
56  *          l_int32          pixWriteSegmentedPageToPS()
57  *          l_int32          pixWriteMixedToPS()
58  *
59  *     Convert any image file to PS for embedding
60  *          l_int32          convertToPSEmbed()
61  *
62  *     Write all images in a pixa out to PS
63  *          l_int32          pixaWriteCompressedToPS()
64  *
65  *  These PostScript converters are used in three different ways.
66  *
67  *  (1) For embedding a PS file in a program like TeX.
68  *      convertToPSEmbed() handles this for levels 1, 2 and 3 output,
69  *      and prog/converttops wraps this in an executable.
70  *      converttops is a generalization of Thomas Merz's jpeg2ps wrapper,
71  *      in that it works for all types (formats, depth, colormap)
72  *      of input images and gives PS output in one of these formats
73  *        * level 1 (uncompressed)
74  *        * level 2 (compressed ccittg4 or dct)
75  *        * level 3 (compressed flate)
76  *
77  *  (2) For composing a set of pages with any number of images
78  *      painted on them, in either level 2 or level 3 formats.
79  *
80  *  (3) For printing a page image or a set of page images, at a
81  *      resolution that optimally fills the page, using
82  *      convertFilesFittedToPS().
83  *
84  *  The top-level calls of utilities in category 2, which can compose
85  *  multiple images on a page, and which generate a PostScript file for
86  *  printing or display (e.g., conversion to pdf), are:
87  *      convertFilesToPS()
88  *      convertFilesFittedToPS()
89  *      convertSegmentedPagesToPS()
90  *
91  *  All images are output with page numbers.  Bounding box hints are
92  *  more subtle.  They must be included for embedding images in
93  *  TeX, for example, and the low-level writers include bounding
94  *  box hints by default.  However, these hints should not be included for
95  *  multi-page PostScript that is composed of a sequence of images;
96  *  consequently, they are not written when calling higher level
97  *  functions such as convertFilesToPS(), convertFilesFittedToPS()
98  *  and convertSegmentedPagesToPS().  The function l_psWriteBoundingBox()
99  *  sets a flag to give low-level control over this.
100  * </pre>
101  */
102 
103 #include <string.h>
104 #include "allheaders.h"
105 
106 /* --------------------------------------------*/
107 #if  USE_PSIO   /* defined in environ.h */
108  /* --------------------------------------------*/
109 
110 /*-------------------------------------------------------------*
111  *                Convert files in a directory to PS           *
112  *-------------------------------------------------------------*/
113 /*
114  *  convertFilesToPS()
115  *
116  *      Input:  dirin (input directory)
117  *              substr (<optional> substring filter on filenames; can be NULL)
118  *              res (typ. 300 or 600 ppi)
119  *              fileout (output ps file)
120  *      Return: 0 if OK, 1 on error
121  *
122  *  Notes:
123  *      (1) This generates a PS file for all image files in a specified
124  *          directory that contain the substr pattern to be matched.
125  *      (2) Each image is written to a separate page in the output PS file.
126  *      (3) All images are written compressed:
127  *              * if tiffg4  -->  use ccittg4
128  *              * if jpeg    -->  use dct
129  *              * all others -->  use flate
130  *          If the image is jpeg or tiffg4, we use the existing compressed
131  *          strings for the encoding; otherwise, we read the image into
132  *          a pix and flate-encode the pieces.
133  *      (4) The resolution is often confusing.  It is interpreted
134  *          as the resolution of the output display device:  "If the
135  *          input image were digitized at 300 ppi, what would it
136  *          look like when displayed at res ppi."  So, for example,
137  *          if res = 100 ppi, then the display pixels are 3x larger
138  *          than the 300 ppi pixels, and the image will be rendered
139  *          3x larger.
140  *      (5) The size of the PostScript file is independent of the resolution,
141  *          because the entire file is encoded.  The res parameter just
142  *          tells the PS decomposer how to render the page.  Therefore,
143  *          for minimum file size without loss of visual information,
144  *          if the output res is less than 300, you should downscale
145  *          the image to the output resolution before wrapping in PS.
146  *      (6) The "canvas" on which the image is rendered, at the given
147  *          output resolution, is a standard page size (8.5 x 11 in).
148  */
149 l_int32
convertFilesToPS(const char * dirin,const char * substr,l_int32 res,const char * fileout)150 convertFilesToPS(const char  *dirin,
151                  const char  *substr,
152                  l_int32      res,
153                  const char  *fileout)
154 {
155 SARRAY  *sa;
156 
157     PROCNAME("convertFilesToPS");
158 
159     if (!dirin)
160         return ERROR_INT("dirin not defined", procName, 1);
161     if (!fileout)
162         return ERROR_INT("fileout not defined", procName, 1);
163     if (res <= 0) {
164         L_INFO("setting res to 300 ppi\n", procName);
165         res = 300;
166     }
167     if (res < 10 || res > 4000)
168         L_WARNING("res is typically in the range 300-600 ppi\n", procName);
169 
170         /* Get all filtered and sorted full pathnames. */
171     sa = getSortedPathnamesInDirectory(dirin, substr, 0, 0);
172 
173         /* Generate the PS file.  Don't use bounding boxes. */
174     l_psWriteBoundingBox(FALSE);
175     sarrayConvertFilesToPS(sa, res, fileout);
176     l_psWriteBoundingBox(TRUE);
177     sarrayDestroy(&sa);
178     return 0;
179 }
180 
181 
182 /*
183  *  sarrayConvertFilesToPS()
184  *
185  *      Input:  sarray (of full path names)
186  *              res (typ. 300 or 600 ppi)
187  *              fileout (output ps file)
188  *      Return: 0 if OK, 1 on error
189  *
190  *  Notes:
191  *      (1) See convertFilesToPS()
192  */
193 l_int32
sarrayConvertFilesToPS(SARRAY * sa,l_int32 res,const char * fileout)194 sarrayConvertFilesToPS(SARRAY      *sa,
195                        l_int32      res,
196                        const char  *fileout)
197 {
198 char    *fname;
199 l_int32  i, nfiles, index, firstfile, ret, format;
200 
201     PROCNAME("sarrayConvertFilesToPS");
202 
203     if (!sa)
204         return ERROR_INT("sa not defined", procName, 1);
205     if (!fileout)
206         return ERROR_INT("fileout not defined", procName, 1);
207     if (res <= 0) {
208         L_INFO("setting res to 300 ppi\n", procName);
209         res = 300;
210     }
211     if (res < 10 || res > 4000)
212         L_WARNING("res is typically in the range 300-600 ppi\n", procName);
213 
214     nfiles = sarrayGetCount(sa);
215     firstfile = TRUE;
216     for (i = 0, index = 0; i < nfiles; i++) {
217         fname = sarrayGetString(sa, i, L_NOCOPY);
218         ret = pixReadHeader(fname, &format, NULL, NULL, NULL, NULL, NULL);
219         if (ret) continue;
220         if (format == IFF_UNKNOWN)
221             continue;
222 
223         writeImageCompressedToPSFile(fname, fileout, res, &firstfile, &index);
224     }
225 
226     return 0;
227 }
228 
229 
230 /*
231  *  convertFilesFittedToPS()
232  *
233  *      Input:  dirin (input directory)
234  *              substr (<optional> substring filter on filenames; can be NULL)
235  *              xpts, ypts (desired size in printer points; use 0 for default)
236  *              fileout (output ps file)
237  *      Return: 0 if OK, 1 on error
238  *
239  *  Notes:
240  *      (1) This generates a PS file for all files in a specified directory
241  *          that contain the substr pattern to be matched.
242  *      (2) Each image is written to a separate page in the output PS file.
243  *      (3) All images are written compressed:
244  *              * if tiffg4  -->  use ccittg4
245  *              * if jpeg    -->  use dct
246  *              * all others -->  use flate
247  *          If the image is jpeg or tiffg4, we use the existing compressed
248  *          strings for the encoding; otherwise, we read the image into
249  *          a pix and flate-encode the pieces.
250  *      (4) The resolution is internally determined such that the images
251  *          are rendered, in at least one direction, at 100% of the given
252  *          size in printer points.  Use 0.0 for xpts or ypts to get
253  *          the default value, which is 612.0 or 792.0, rsp.
254  *      (5) The size of the PostScript file is independent of the resolution,
255  *          because the entire file is encoded.  The %xpts and %ypts
256  *          parameter tells the PS decomposer how to render the page.
257  */
258 l_int32
convertFilesFittedToPS(const char * dirin,const char * substr,l_float32 xpts,l_float32 ypts,const char * fileout)259 convertFilesFittedToPS(const char  *dirin,
260                        const char  *substr,
261                        l_float32    xpts,
262                        l_float32    ypts,
263                        const char  *fileout)
264 {
265 SARRAY  *sa;
266 
267     PROCNAME("convertFilesFittedToPS");
268 
269     if (!dirin)
270         return ERROR_INT("dirin not defined", procName, 1);
271     if (!fileout)
272         return ERROR_INT("fileout not defined", procName, 1);
273     if (xpts <= 0.0) {
274         L_INFO("setting xpts to 612.0 ppi\n", procName);
275         xpts = 612.0;
276     }
277     if (ypts <= 0.0) {
278         L_INFO("setting ypts to 792.0 ppi\n", procName);
279         ypts = 792.0;
280     }
281     if (xpts < 100.0 || xpts > 2000.0 || ypts < 100.0 || ypts > 2000.0)
282         L_WARNING("xpts,ypts are typically in the range 500-800\n", procName);
283 
284         /* Get all filtered and sorted full pathnames. */
285     sa = getSortedPathnamesInDirectory(dirin, substr, 0, 0);
286 
287         /* Generate the PS file.  Don't use bounding boxes. */
288     l_psWriteBoundingBox(FALSE);
289     sarrayConvertFilesFittedToPS(sa, xpts, ypts, fileout);
290     l_psWriteBoundingBox(TRUE);
291     sarrayDestroy(&sa);
292     return 0;
293 }
294 
295 
296 /*
297  *  sarrayConvertFilesFittedToPS()
298  *
299  *      Input:  sarray (of full path names)
300  *              xpts, ypts (desired size in printer points; use 0 for default)
301  *              fileout (output ps file)
302  *      Return: 0 if OK, 1 on error
303  *
304  *  Notes:
305  *      (1) See convertFilesFittedToPS()
306  */
307 l_int32
sarrayConvertFilesFittedToPS(SARRAY * sa,l_float32 xpts,l_float32 ypts,const char * fileout)308 sarrayConvertFilesFittedToPS(SARRAY      *sa,
309                              l_float32    xpts,
310                              l_float32    ypts,
311                              const char  *fileout)
312 {
313 char    *fname;
314 l_int32  ret, i, w, h, nfiles, index, firstfile, format, res;
315 
316     PROCNAME("sarrayConvertFilesFittedToPS");
317 
318     if (!sa)
319         return ERROR_INT("sa not defined", procName, 1);
320     if (!fileout)
321         return ERROR_INT("fileout not defined", procName, 1);
322     if (xpts <= 0.0) {
323         L_INFO("setting xpts to 612.0\n", procName);
324         xpts = 612.0;
325     }
326     if (ypts <= 0.0) {
327         L_INFO("setting ypts to 792.0\n", procName);
328         ypts = 792.0;
329     }
330     if (xpts < 100.0 || xpts > 2000.0 || ypts < 100.0 || ypts > 2000.0)
331         L_WARNING("xpts,ypts are typically in the range 500-800\n", procName);
332 
333     nfiles = sarrayGetCount(sa);
334     firstfile = TRUE;
335     for (i = 0, index = 0; i < nfiles; i++) {
336         fname = sarrayGetString(sa, i, L_NOCOPY);
337         ret = pixReadHeader(fname, &format, &w, &h, NULL, NULL, NULL);
338         if (ret) continue;
339         if (format == IFF_UNKNOWN)
340             continue;
341 
342             /* Be sure the entire image is wrapped */
343         if (xpts * h < ypts * w)
344             res = (l_int32)((l_float32)w * 72.0 / xpts);
345         else
346             res = (l_int32)((l_float32)h * 72.0 / ypts);
347 
348         writeImageCompressedToPSFile(fname, fileout, res, &firstfile, &index);
349     }
350 
351     return 0;
352 }
353 
354 
355 /*
356  *  writeImageCompressedToPSFile()
357  *
358  *      Input:  filein (input image file)
359  *              fileout (output ps file)
360  *              res (output printer resolution)
361  *              &firstfile (<input and return> 1 if the first image;
362  *                          0 otherwise)
363  *              &index (<input and return> index of image in output ps file)
364  *      Return: 0 if OK, 1 on error
365  *
366  *  Notes:
367  *      (1) This wraps a single page image in PS.
368  *      (2) The input file can be in any format.  It is compressed as follows:
369  *             * if in tiffg4  -->  use ccittg4
370  *             * if in jpeg    -->  use dct
371  *             * all others    -->  use flate
372  *      (3) Before the first call, set %firstpage = 1.  After writing
373  *          the first page, it will be set to 0.
374  *      (4) %index is incremented if the page is successfully written.
375  */
376 l_int32
writeImageCompressedToPSFile(const char * filein,const char * fileout,l_int32 res,l_int32 * pfirstfile,l_int32 * pindex)377 writeImageCompressedToPSFile(const char  *filein,
378                              const char  *fileout,
379                              l_int32      res,
380                              l_int32     *pfirstfile,
381                              l_int32     *pindex)
382 {
383 const char  *op;
384 l_int32      format, retval;
385 
386     PROCNAME("writeImageCompressedToPSFile");
387 
388     if (!pfirstfile || !pindex)
389         return ERROR_INT("&firstfile and &index not defined", procName, 1);
390 
391     findFileFormat(filein, &format);
392     if (format == IFF_UNKNOWN) {
393         L_ERROR("format of %s not known\n", procName, filein);
394         return 1;
395     }
396 
397     op = (*pfirstfile == TRUE) ? "w" : "a";
398     if (format == IFF_JFIF_JPEG) {
399         retval = convertJpegToPS(filein, fileout, op, 0, 0,
400                                  res, 1.0, *pindex + 1, TRUE);
401         if (retval == 0) {
402             *pfirstfile = FALSE;
403             (*pindex)++;
404         }
405     } else if (format == IFF_TIFF_G4) {
406         retval = convertG4ToPS(filein, fileout, op, 0, 0,
407                                res, 1.0, *pindex + 1, FALSE, TRUE);
408         if (retval == 0) {
409             *pfirstfile = FALSE;
410             (*pindex)++;
411         }
412     } else {  /* all other image formats */
413         retval = convertFlateToPS(filein, fileout, op, 0, 0,
414                                   res, 1.0, *pindex + 1, TRUE);
415         if (retval == 0) {
416             *pfirstfile = FALSE;
417             (*pindex)++;
418         }
419     }
420 
421     return retval;
422 }
423 
424 
425 /*-------------------------------------------------------------*
426  *              Convert mixed text/image files to PS           *
427  *-------------------------------------------------------------*/
428 /*
429  *  convertSegmentedPagesToPS()
430  *
431  *      Input:  pagedir (input page image directory)
432  *              pagestr (<optional> substring filter on page filenames;
433  *                       can be NULL)
434  *              page_numpre (number of characters in page name before number)
435  *              maskdir (input mask image directory)
436  *              maskstr (<optional> substring filter on mask filenames;
437  *                       can be NULL)
438  *              mask_numpre (number of characters in mask name before number)
439  *              numpost (number of characters in names after number)
440  *              maxnum (only consider page numbers up to this value)
441  *              textscale (scale of text output relative to pixs)
442  *              imagescale (scale of image output relative to pixs)
443  *              threshold (for binarization; typ. about 190; 0 for default)
444  *              fileout (output ps file)
445  *      Return: 0 if OK, 1 on error
446  *
447  *  Notes:
448  *      (1) This generates a PS file for all page image and mask files in two
449  *          specified directories and that contain the page numbers as
450  *          specified below.  The two directories can be the same, in which
451  *          case the page and mask files are differentiated by the two
452  *          substrings for string matches.
453  *      (2) The page images are taken in lexicographic order.
454  *          Mask images whose numbers match the page images are used to
455  *          segment the page images.  Page images without a matching
456  *          mask image are scaled, thresholded and rendered entirely as text.
457  *      (3) Each PS page is generated as a compressed representation of
458  *          the page image, where the part of the image under the mask
459  *          is suitably scaled and compressed as DCT (i.e., jpeg), and
460  *          the remaining part of the page is suitably scaled, thresholded,
461  *          compressed as G4 (i.e., tiff g4), and rendered by painting
462  *          black through the resulting text mask.
463  *      (4) The scaling is typically 2x down for the DCT component
464  *          (%imagescale = 0.5) and 2x up for the G4 component
465  *          (%textscale = 2.0).
466  *      (5) The resolution is automatically set to fit to a
467  *          letter-size (8.5 x 11 inch) page.
468  *      (6) Both the DCT and the G4 encoding are PostScript level 2.
469  *      (7) It is assumed that the page number is contained within
470  *          the basename (the filename without directory or extension).
471  *          %page_numpre is the number of characters in the page basename
472  *          preceding the actual page number; %mask_numpre is likewise for
473  *          the mask basename; %numpost is the number of characters
474  *          following the page number.  For example, for mask name
475  *          mask_006.tif, mask_numpre = 5 ("mask_).
476  *      (8) To render a page as is -- that is, with no thresholding
477  *          of any pixels -- use a mask in the mask directory that is
478  *          full size with all pixels set to 1.  If the page is 1 bpp,
479  *          it is not necessary to have a mask.
480  */
481 l_int32
convertSegmentedPagesToPS(const char * pagedir,const char * pagestr,l_int32 page_numpre,const char * maskdir,const char * maskstr,l_int32 mask_numpre,l_int32 numpost,l_int32 maxnum,l_float32 textscale,l_float32 imagescale,l_int32 threshold,const char * fileout)482 convertSegmentedPagesToPS(const char  *pagedir,
483                           const char  *pagestr,
484                           l_int32      page_numpre,
485                           const char  *maskdir,
486                           const char  *maskstr,
487                           l_int32      mask_numpre,
488                           l_int32      numpost,
489                           l_int32      maxnum,
490                           l_float32    textscale,
491                           l_float32    imagescale,
492                           l_int32      threshold,
493                           const char  *fileout)
494 {
495 l_int32  pageno, i, npages;
496 PIX     *pixs, *pixm;
497 SARRAY  *sapage, *samask;
498 
499     PROCNAME("convertSegmentedPagesToPS");
500 
501     if (!pagedir)
502         return ERROR_INT("pagedir not defined", procName, 1);
503     if (!maskdir)
504         return ERROR_INT("maskdir not defined", procName, 1);
505     if (!fileout)
506         return ERROR_INT("fileout not defined", procName, 1);
507     if (threshold <= 0) {
508         L_INFO("setting threshold to 190\n", procName);
509         threshold = 190;
510     }
511 
512         /* Get numbered full pathnames; max size of sarray is maxnum */
513     sapage = getNumberedPathnamesInDirectory(pagedir, pagestr,
514                                              page_numpre, numpost, maxnum);
515     samask = getNumberedPathnamesInDirectory(maskdir, maskstr,
516                                              mask_numpre, numpost, maxnum);
517     sarrayPadToSameSize(sapage, samask, (char *)"");
518     if ((npages = sarrayGetCount(sapage)) == 0) {
519         sarrayDestroy(&sapage);
520         sarrayDestroy(&samask);
521         return ERROR_INT("no matching pages found", procName, 1);
522     }
523 
524         /* Generate the PS file */
525     pageno = 1;
526     for (i = 0; i < npages; i++) {
527         if ((pixs = pixReadIndexed(sapage, i)) == NULL)
528             continue;
529         pixm = pixReadIndexed(samask, i);
530         pixWriteSegmentedPageToPS(pixs, pixm, textscale, imagescale,
531                                   threshold, pageno, fileout);
532         pixDestroy(&pixs);
533         pixDestroy(&pixm);
534         pageno++;
535     }
536 
537     sarrayDestroy(&sapage);
538     sarrayDestroy(&samask);
539     return 0;
540 }
541 
542 
543 /*
544  *  pixWriteSegmentedPageToPS()
545  *
546  *      Input:  pixs (all depths; colormap ok)
547  *              pixm (<optional> 1 bpp segmentation mask over image region)
548  *              textscale (scale of text output relative to pixs)
549  *              imagescale (scale of image output relative to pixs)
550  *              threshold (threshold for binarization; typ. 190)
551  *              pageno (page number in set; use 1 for new output file)
552  *              fileout (output ps file)
553  *      Return: 0 if OK, 1 on error
554  *
555  *  Notes:
556  *      (1) This generates the PS string for a mixed text/image page,
557  *          and adds it to an existing file if %pageno > 1.
558  *          The PS output is determined by fitting the result to
559  *          a letter-size (8.5 x 11 inch) page.
560  *      (2) The two images (pixs and pixm) are at the same resolution
561  *          (typically 300 ppi).  They are used to generate two compressed
562  *          images, pixb and pixc, that are put directly into the output
563  *          PS file.
564  *      (3) pixb is the text component.  In the PostScript world, we think of
565  *          it as a mask through which we paint black.  It is produced by
566  *          scaling pixs by %textscale, and thresholding to 1 bpp.
567  *      (4) pixc is the image component, which is that part of pixs under
568  *          the mask pixm.  It is scaled from pixs by %imagescale.
569  *      (5) Typical values are textscale = 2.0 and imagescale = 0.5.
570  *      (6) If pixm == NULL, the page has only text.  If it is all black,
571  *          the page is all image and has no text.
572  *      (7) This can be used to write a multi-page PS file, by using
573  *          sequential page numbers with the same output file.  It can
574  *          also be used to write separate PS files for each page,
575  *          by using different output files with %pageno = 0 or 1.
576  */
577 l_int32
pixWriteSegmentedPageToPS(PIX * pixs,PIX * pixm,l_float32 textscale,l_float32 imagescale,l_int32 threshold,l_int32 pageno,const char * fileout)578 pixWriteSegmentedPageToPS(PIX         *pixs,
579                           PIX         *pixm,
580                           l_float32    textscale,
581                           l_float32    imagescale,
582                           l_int32      threshold,
583                           l_int32      pageno,
584                           const char  *fileout)
585 {
586 l_int32    alltext, notext, d, ret;
587 l_uint32   val;
588 l_float32  scaleratio;
589 PIX       *pixmi, *pixmis, *pixt, *pixg, *pixsc, *pixb, *pixc;
590 
591     PROCNAME("pixWriteSegmentedPageToPS");
592 
593     if (!pixs)
594         return ERROR_INT("pixs not defined", procName, 1);
595     if (!fileout)
596         return ERROR_INT("fileout not defined", procName, 1);
597     if (imagescale <= 0.0 || textscale <= 0.0)
598         return ERROR_INT("relative scales must be > 0.0", procName, 1);
599 
600         /* Analyze the page.  Determine the ratio by which the
601          * binary text mask is scaled relative to the image part.
602          * If there is no image region (alltext == TRUE), the
603          * text mask will be rendered directly to fit the page,
604          * and scaleratio = 1.0.  */
605     alltext = TRUE;
606     notext = FALSE;
607     scaleratio = 1.0;
608     if (pixm) {
609         pixZero(pixm, &alltext);  /* pixm empty: all text */
610         if (alltext) {
611             pixm = NULL;  /* treat it as not existing here */
612         } else {
613             pixmi = pixInvert(NULL, pixm);
614             pixZero(pixmi, &notext);  /* pixm full; no text */
615             pixDestroy(&pixmi);
616             scaleratio = textscale / imagescale;
617         }
618     }
619 
620     if (pixGetDepth(pixs) == 1) {  /* render tiff g4 */
621         pixb = pixClone(pixs);
622         pixc = NULL;
623     } else {
624         pixt = pixConvertTo8Or32(pixs, L_CLONE, 0);  /* clone if possible */
625 
626             /* Get the binary text mask.  Note that pixg cannot be a
627              * clone of pixs, because it may be altered by pixSetMasked(). */
628         pixb = NULL;
629         if (notext == FALSE) {
630             d = pixGetDepth(pixt);
631             if (d == 8)
632                 pixg = pixCopy(NULL, pixt);
633             else  /* d == 32 */
634                 pixg = pixConvertRGBToLuminance(pixt);
635             if (pixm)  /* clear out the image parts */
636                 pixSetMasked(pixg, pixm, 255);
637             if (textscale == 1.0)
638                 pixsc = pixClone(pixg);
639             else if (textscale >= 0.7)
640                 pixsc = pixScaleGrayLI(pixg, textscale, textscale);
641             else
642                 pixsc = pixScaleAreaMap(pixg, textscale, textscale);
643             pixb = pixThresholdToBinary(pixsc, threshold);
644             pixDestroy(&pixg);
645             pixDestroy(&pixsc);
646         }
647 
648             /* Get the scaled image region */
649         pixc = NULL;
650         if (pixm) {
651             if (imagescale == 1.0)
652                 pixsc = pixClone(pixt);  /* can possibly be a clone of pixs */
653             else
654                 pixsc = pixScale(pixt, imagescale, imagescale);
655 
656                 /* If pixm is not full, clear the pixels in pixsc
657                  * corresponding to bg in pixm, where there can be text
658                  * that is written through the mask pixb.  Note that
659                  * we could skip this and use pixsc directly in
660                  * pixWriteMixedToPS(); however, clearing these
661                  * non-image regions to a white background will reduce
662                  * the size of pixc (relative to pixsc), and hence
663                  * reduce the size of the PS file that is generated.
664                  * Use a copy so that we don't accidentally alter pixs.  */
665             if (notext == FALSE) {
666                 pixmis = pixScale(pixm, imagescale, imagescale);
667                 pixmi = pixInvert(NULL, pixmis);
668                 val = (d == 8) ? 0xff : 0xffffff00;
669                 pixc = pixCopy(NULL, pixsc);
670                 pixSetMasked(pixc, pixmi, val);  /* clear non-image part */
671                 pixDestroy(&pixmis);
672                 pixDestroy(&pixmi);
673             } else {
674                 pixc = pixClone(pixsc);
675             }
676             pixDestroy(&pixsc);
677         }
678         pixDestroy(&pixt);
679     }
680 
681         /* Generate the PS file.  Don't use bounding boxes. */
682     l_psWriteBoundingBox(FALSE);
683     ret = pixWriteMixedToPS(pixb, pixc, scaleratio, pageno, fileout);
684     l_psWriteBoundingBox(TRUE);
685     pixDestroy(&pixb);
686     pixDestroy(&pixc);
687     return ret;
688 }
689 
690 
691 /*
692  *  pixWriteMixedToPS()
693  *
694  *      Input:  pixb (<optional> 1 bpp "mask"; typically for text)
695  *              pixc (<optional> 8 or 32 bpp image regions)
696  *              scale (relative scale factor for rendering pixb
697  *                    relative to pixc; typ. 4.0)
698  *              pageno (page number in set; use 1 for new output file)
699  *              fileout (output ps file)
700  *      Return: 0 if OK, 1 on error
701  *
702  *  Notes:
703  *      (1) This low level function generates the PS string for a mixed
704  *          text/image page, and adds it to an existing file if
705  *          %pageno > 1.
706  *      (2) The two images (pixb and pixc) are typically generated at the
707  *          resolution that they will be rendered in the PS file.
708  *      (3) pixb is the text component.  In the PostScript world, we think of
709  *          it as a mask through which we paint black.
710  *      (4) pixc is the (typically halftone) image component.  It is
711  *          white in the rest of the page.  To minimize the size of the
712  *          PS file, it should be rendered at a resolution that is at
713  *          least equal to its actual resolution.
714  *      (5) %scale gives the ratio of resolution of pixb to pixc.
715  *          Typical resolutions are: 600 ppi for pixb, 150 ppi for pixc;
716  *          so %scale = 4.0.  If one of the images is not defined,
717  *          the value of %scale is ignored.
718  *      (6) We write pixc with DCT compression (jpeg).  This is followed
719  *          by painting the text as black through the mask pixb.  If
720  *          pixc doesn't exist (alltext), we write the text with the
721  *          PS "image" operator instead of the "imagemask" operator,
722  *          because ghostscript's ps2pdf is flaky when the latter is used.
723  *      (7) The actual output resolution is determined by fitting the
724  *          result to a letter-size (8.5 x 11 inch) page.
725  */
726 l_int32
pixWriteMixedToPS(PIX * pixb,PIX * pixc,l_float32 scale,l_int32 pageno,const char * fileout)727 pixWriteMixedToPS(PIX         *pixb,
728                   PIX         *pixc,
729                   l_float32    scale,
730                   l_int32      pageno,
731                   const char  *fileout)
732 {
733 char        *tname;
734 const char  *op;
735 l_int32      resb, resc, endpage, maskop, ret;
736 
737     PROCNAME("pixWriteMixedToPS");
738 
739     if (!pixb && !pixc)
740         return ERROR_INT("pixb and pixc both undefined", procName, 1);
741     if (!fileout)
742         return ERROR_INT("fileout not defined", procName, 1);
743 
744         /* Compute the resolution that fills a letter-size page. */
745     if (!pixc) {
746        resb = getResLetterPage(pixGetWidth(pixb), pixGetHeight(pixb), 0);
747     } else {
748        resc = getResLetterPage(pixGetWidth(pixc), pixGetHeight(pixc), 0);
749        if (pixb)
750            resb = (l_int32)(scale * resc);
751     }
752 
753         /* Write the jpeg image first */
754     if (pixc) {
755         tname = l_makeTempFilename();
756         pixWrite(tname, pixc, IFF_JFIF_JPEG);
757         endpage = (pixb) ? FALSE : TRUE;
758         op = (pageno <= 1) ? "w" : "a";
759         ret = convertJpegToPS(tname, fileout, op, 0, 0, resc, 1.0,
760                               pageno, endpage);
761         lept_rmfile(tname);
762         LEPT_FREE(tname);
763         if (ret)
764             return ERROR_INT("jpeg data not written", procName, 1);
765     }
766 
767         /* Write the binary data, either directly or, if there is
768          * a jpeg image on the page, through the mask. */
769     if (pixb) {
770         tname = l_makeTempFilename();
771         pixWrite(tname, pixb, IFF_TIFF_G4);
772         op = (pageno <= 1 && !pixc) ? "w" : "a";
773         maskop = (pixc) ? 1 : 0;
774         ret = convertG4ToPS(tname, fileout, op, 0, 0, resb, 1.0,
775                             pageno, maskop, 1);
776         lept_rmfile(tname);
777         LEPT_FREE(tname);
778         if (ret)
779             return ERROR_INT("tiff data not written", procName, 1);
780     }
781 
782     return 0;
783 }
784 
785 
786 /*-------------------------------------------------------------*
787  *            Convert any image file to PS for embedding       *
788  *-------------------------------------------------------------*/
789 /*
790  *  convertToPSEmbed()
791  *
792  *      Input:  filein (input image file -- any format)
793  *              fileout (output ps file)
794  *              level (compression: 1 (uncompressed), 2 or 3)
795  *      Return: 0 if OK, 1 on error
796  *
797  *  Notes:
798  *      (1) This is a wrapper function that generates a PS file with
799  *          a bounding box, from any input image file.
800  *      (2) Do the best job of compression given the specified level.
801  *          %level=3 does flate compression on anything that is not
802  *          tiffg4 (1 bpp) or jpeg (8 bpp or rgb).
803  *      (3) If %level=2 and the file is not tiffg4 or jpeg, it will
804  *          first be written to file as jpeg with quality = 75.
805  *          This will remove the colormap and cause some degradation
806  *          in the image.
807  *      (4) The bounding box is required when a program such as TeX
808  *          (through epsf) places and rescales the image.  It is
809  *          sized for fitting the image to an 8.5 x 11.0 inch page.
810  */
811 l_int32
convertToPSEmbed(const char * filein,const char * fileout,l_int32 level)812 convertToPSEmbed(const char  *filein,
813                  const char  *fileout,
814                  l_int32      level)
815 {
816 char    *tname;
817 l_int32  d, format;
818 PIX     *pix, *pixs;
819 
820     PROCNAME("convertToPSEmbed");
821 
822     if (!filein)
823         return ERROR_INT("filein not defined", procName, 1);
824     if (!fileout)
825         return ERROR_INT("fileout not defined", procName, 1);
826     if (level != 1 && level != 2 && level != 3) {
827         L_ERROR("invalid level specified; using level 2\n", procName);
828         level = 2;
829     }
830 
831     if (level == 1) {  /* no compression */
832         pixWritePSEmbed(filein, fileout);
833         return 0;
834     }
835 
836         /* Find the format and write out directly if in jpeg or tiff g4 */
837     findFileFormat(filein, &format);
838     if (format == IFF_JFIF_JPEG) {
839         convertJpegToPSEmbed(filein, fileout);
840         return 0;
841     } else if (format == IFF_TIFF_G4) {
842         convertG4ToPSEmbed(filein, fileout);
843         return 0;
844     } else if (format == IFF_UNKNOWN) {
845         L_ERROR("format of %s not known\n", procName, filein);
846         return 1;
847     }
848 
849         /* If level 3, flate encode. */
850     if (level == 3) {
851         convertFlateToPSEmbed(filein, fileout);
852         return 0;
853     }
854 
855         /* OK, it's level 2, so we must convert to jpeg or tiff g4 */
856     if ((pixs = pixRead(filein)) == NULL)
857         return ERROR_INT("image not read from file", procName, 1);
858     d = pixGetDepth(pixs);
859     if ((d == 2 || d == 4) && !pixGetColormap(pixs))
860         pix = pixConvertTo8(pixs, 0);
861     else if (d == 16)
862         pix = pixConvert16To8(pixs, 1);
863     else
864         pix = pixRemoveColormap(pixs, REMOVE_CMAP_BASED_ON_SRC);
865 
866     d = pixGetDepth(pix);
867     tname = l_makeTempFilename();
868     if (d == 1) {
869         pixWrite(tname, pix, IFF_TIFF_G4);
870         convertG4ToPSEmbed(tname, fileout);
871     } else {
872         pixWrite(tname, pix, IFF_JFIF_JPEG);
873         convertJpegToPSEmbed(tname, fileout);
874     }
875 
876     lept_rmfile(tname);
877     LEPT_FREE(tname);
878     pixDestroy(&pix);
879     pixDestroy(&pixs);
880     return 0;
881 }
882 
883 
884 /*-------------------------------------------------------------*
885  *              Write all images in a pixa out to PS           *
886  *-------------------------------------------------------------*/
887 /*
888  *  pixaWriteCompressedToPS()
889  *
890  *      Input:  pixa (any set of images)
891  *              fileout (output ps file)
892  *              res (of input image)
893  *              level (compression: 2 or 3)
894  *      Return: 0 if OK, 1 on error
895  *
896  *  Notes:
897  *      (1) This generates a PS file of multiple page images, all
898  *          with bounding boxes.
899  *      (2) It compresses to:
900  *              cmap + level2:        jpeg
901  *              cmap + level3:        flate
902  *              1 bpp:                tiffg4
903  *              2 or 4 bpp + level2:  jpeg
904  *              2 or 4 bpp + level3:  flate
905  *              8 bpp:                jpeg
906  *              16 bpp:               flate
907  *              32 bpp:               jpeg
908  *      (3) To generate a pdf, use: ps2pdf <infile.ps> <outfile.pdf>
909  */
910 l_int32
pixaWriteCompressedToPS(PIXA * pixa,const char * fileout,l_int32 res,l_int32 level)911 pixaWriteCompressedToPS(PIXA        *pixa,
912                         const char  *fileout,
913                         l_int32      res,
914                         l_int32      level)
915 {
916 char     *tname;
917 l_int32   i, n, firstfile, index, writeout, d;
918 PIX      *pix, *pixt;
919 PIXCMAP  *cmap;
920 
921     PROCNAME("pixaWriteCompressedToPS");
922 
923     if (!pixa)
924         return ERROR_INT("pixa not defined", procName, 1);
925     if (!fileout)
926         return ERROR_INT("fileout not defined", procName, 1);
927     if (level != 2 && level != 3) {
928         L_ERROR("only levels 2 and 3 permitted; using level 2\n", procName);
929         level = 2;
930     }
931 
932     n = pixaGetCount(pixa);
933     firstfile = TRUE;
934     index = 0;
935     tname = l_makeTempFilename();
936     for (i = 0; i < n; i++) {
937         writeout = TRUE;
938         pix = pixaGetPix(pixa, i, L_CLONE);
939         d = pixGetDepth(pix);
940         cmap = pixGetColormap(pix);
941         if (d == 1) {
942             pixWrite(tname, pix, IFF_TIFF_G4);
943         } else if (cmap) {
944             if (level == 2) {
945                 pixt = pixConvertForPSWrap(pix);
946                 pixWrite(tname, pixt, IFF_JFIF_JPEG);
947                 pixDestroy(&pixt);
948             } else {  /* level == 3 */
949                 pixWrite(tname, pix, IFF_PNG);
950             }
951         } else if (d == 16) {
952             if (level == 2)
953                 L_WARNING("d = 16; must write out flate\n", procName);
954             pixWrite(tname, pix, IFF_PNG);
955         } else if (d == 2 || d == 4) {
956             if (level == 2) {
957                 pixt = pixConvertTo8(pix, 0);
958                 pixWrite(tname, pixt, IFF_JFIF_JPEG);
959                 pixDestroy(&pixt);
960             } else {  /* level == 3 */
961                 pixWrite(tname, pix, IFF_PNG);
962             }
963         } else if (d == 8 || d == 32) {
964             pixWrite(tname, pix, IFF_JFIF_JPEG);
965         } else {  /* shouldn't happen */
966             L_ERROR("invalid depth: %d\n", procName, d);
967             writeout = FALSE;
968         }
969         pixDestroy(&pix);
970 
971         if (writeout)
972             writeImageCompressedToPSFile(tname, fileout, res,
973                                          &firstfile, &index);
974     }
975 
976     lept_rmfile(tname);
977     LEPT_FREE(tname);
978     return 0;
979 }
980 
981 
982 /* --------------------------------------------*/
983 #endif  /* USE_PSIO */
984 /* --------------------------------------------*/
985