leptonica-1.76.0/src/scale1.c

/*====================================================================*
 -  Copyright (C) 2001 Leptonica.  All rights reserved.
 -
 -  Redistribution and use in source and binary forms, with or without
 -  modification, are permitted provided that the following conditions
 -  are met:
 -  1. Redistributions of source code must retain the above copyright
 -     notice, this list of conditions and the following disclaimer.
 -  2. Redistributions in binary form must reproduce the above
 -     copyright notice, this list of conditions and the following
 -     disclaimer in the documentation and/or other materials
 -     provided with the distribution.
 -
 -  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 -  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 -  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 -  A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL ANY
 -  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 -  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 -  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 -  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 -  OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 -  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 -  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *====================================================================*/

/*!
 * \file scale1.c
 * <pre>
 *         Top-level scaling
 *               PIX      *pixScale()
 *               PIX      *pixScaleToSizeRel()
 *               PIX      *pixScaleToSize()
 *               PIX      *pixScaleGeneral()
 *
 *         Linearly interpreted (usually up-) scaling
 *               PIX      *pixScaleLI()
 *               PIX      *pixScaleColorLI()
 *               PIX      *pixScaleColor2xLI()
 *               PIX      *pixScaleColor4xLI()
 *               PIX      *pixScaleGrayLI()
 *               PIX      *pixScaleGray2xLI()
 *               PIX      *pixScaleGray4xLI()
 *
 *         Upscale 2x followed by binarization
 *               PIX      *pixScaleGray2xLIThresh()
 *               PIX      *pixScaleGray2xLIDither()
 *
 *         Upscale 4x followed by binarization
 *               PIX      *pixScaleGray4xLIThresh()
 *               PIX      *pixScaleGray4xLIDither()
 *
 *         Scaling by closest pixel sampling
 *               PIX      *pixScaleBySampling()
 *               PIX      *pixScaleBySamplingToSize()
 *               PIX      *pixScaleByIntSampling()
 *
 *         Fast integer factor subsampling RGB to gray and to binary
 *               PIX      *pixScaleRGBToGrayFast()
 *               PIX      *pixScaleRGBToBinaryFast()
 *               PIX      *pixScaleGrayToBinaryFast()
 *
 *         Downscaling with (antialias) smoothing
 *               PIX      *pixScaleSmooth()
 *               PIX      *pixScaleSmoothToSize()
 *               PIX      *pixScaleRGBToGray2()   [special 2x reduction to gray]
 *
 *         Downscaling with (antialias) area mapping
 *               PIX      *pixScaleAreaMap()
 *               PIX      *pixScaleAreaMap2()
 *               PIX      *pixScaleAreaMapToSize()
 *
 *         Binary scaling by closest pixel sampling
 *               PIX      *pixScaleBinary()
 *
 *     Low-level static functions:
 *
 *         Color (interpolated) scaling: general case
 *               static void       scaleColorLILow()
 *
 *         Grayscale (interpolated) scaling: general case
 *               static void       scaleGrayLILow()
 *
 *         Color (interpolated) scaling: 2x upscaling
 *               static void       scaleColor2xLILow()
 *               static void       scaleColor2xLILineLow()
 *
 *         Grayscale (interpolated) scaling: 2x upscaling
 *               static void       scaleGray2xLILow()
 *               static void       scaleGray2xLILineLow()
 *
 *         Grayscale (interpolated) scaling: 4x upscaling
 *               static void       scaleGray4xLILow()
 *               static void       scaleGray4xLILineLow()
 *
 *         Grayscale and color scaling by closest pixel sampling
 *               static l_int32    scaleBySamplingLow()
 *
 *         Color and grayscale downsampling with (antialias) lowpass filter
 *               static l_int32    scaleSmoothLow()
 *               static void       scaleRGBToGray2Low()
 *
 *         Color and grayscale downsampling with (antialias) area mapping
 *               static l_int32    scaleColorAreaMapLow()
 *               static l_int32    scaleGrayAreaMapLow()
 *               static l_int32    scaleAreaMapLow2()
 *
 *         Binary scaling by closest pixel sampling
 *               static l_int32    scaleBinaryLow()
 * </pre>
 */

#include <string.h>
#include "allheaders.h"

static void scaleColorLILow(l_uint32 *datad, l_int32 wd, l_int32 hd,
                            l_int32 wpld, l_uint32 *datas, l_int32 ws,
                            l_int32 hs, l_int32 wpls);
static void scaleGrayLILow(l_uint32 *datad, l_int32 wd, l_int32 hd,
                           l_int32 wpld, l_uint32 *datas, l_int32 ws,
                           l_int32 hs, l_int32 wpls);
static void scaleColor2xLILow(l_uint32 *datad, l_int32 wpld, l_uint32 *datas,
                              l_int32 ws, l_int32 hs, l_int32 wpls);
static void scaleColor2xLILineLow(l_uint32 *lined, l_int32 wpld,
                                  l_uint32 *lines, l_int32 ws, l_int32 wpls,
                                  l_int32 lastlineflag);
static void scaleGray2xLILow(l_uint32 *datad, l_int32 wpld, l_uint32 *datas,
                             l_int32 ws, l_int32 hs, l_int32 wpls);
static void scaleGray2xLILineLow(l_uint32 *lined, l_int32 wpld,
                                 l_uint32 *lines, l_int32 ws, l_int32 wpls,
                                 l_int32 lastlineflag);
static void scaleGray4xLILow(l_uint32 *datad, l_int32 wpld, l_uint32 *datas,
                             l_int32 ws, l_int32 hs, l_int32 wpls);
static void scaleGray4xLILineLow(l_uint32 *lined, l_int32 wpld,
                                 l_uint32 *lines, l_int32 ws, l_int32 wpls,
                                 l_int32 lastlineflag);
static l_int32 scaleBySamplingLow(l_uint32 *datad, l_int32 wd, l_int32 hd,
                                  l_int32 wpld, l_uint32 *datas, l_int32 ws,
                                  l_int32 hs, l_int32 d, l_int32 wpls);
static l_int32 scaleSmoothLow(l_uint32 *datad, l_int32 wd, l_int32 hd,
                              l_int32 wpld, l_uint32 *datas, l_int32 ws,
                              l_int32 hs, l_int32 d, l_int32 wpls,
                              l_int32 size);
static void scaleRGBToGray2Low(l_uint32 *datad, l_int32 wd, l_int32 hd,
                               l_int32 wpld, l_uint32 *datas, l_int32 wpls,
                               l_float32 rwt, l_float32 gwt, l_float32 bwt);
static void scaleColorAreaMapLow(l_uint32 *datad, l_int32 wd, l_int32 hd,
                                 l_int32 wpld, l_uint32 *datas, l_int32 ws,
                                 l_int32 hs, l_int32 wpls);
static void scaleGrayAreaMapLow(l_uint32 *datad, l_int32 wd, l_int32 hd,
                                l_int32 wpld, l_uint32 *datas, l_int32 ws,
                                l_int32 hs, l_int32 wpls);
static void scaleAreaMapLow2(l_uint32 *datad, l_int32 wd, l_int32 hd,
                             l_int32 wpld, l_uint32 *datas, l_int32 d,
                             l_int32 wpls);
static l_int32 scaleBinaryLow(l_uint32 *datad, l_int32 wd, l_int32 hd,
                              l_int32 wpld, l_uint32 *datas, l_int32 ws,
                              l_int32 hs, l_int32 wpls);

#ifndef  NO_CONSOLE_IO
#define  DEBUG_OVERFLOW   0
#define  DEBUG_UNROLLING  0
#endif  /* ~NO_CONSOLE_IO */


/*------------------------------------------------------------------*
 *                    Top level scaling dispatcher                  *
 *------------------------------------------------------------------*/
/*!
 * \brief   pixScale()
 *
 * \param[in]    pixs 1, 2, 4, 8, 16 and 32 bpp
 * \param[in]    scalex, scaley
 * \return  pixd, or NULL on error
 *
 *  This function scales 32 bpp RGB; 2, 4 or 8 bpp palette color;
 *  2, 4, 8 or 16 bpp gray; and binary images.
 *
 *  When the input has palette color, the colormap is removed and
 *  the result is either 8 bpp gray or 32 bpp RGB, depending on whether
 *  the colormap has color entries.  Images with 2, 4 or 16 bpp are
 *  converted to 8 bpp.
 *
 *  Because pixScale is meant to be a very simple interface to a
 *  number of scaling functions, including the use of unsharp masking,
 *  the type of scaling and the sharpening parameters are chosen
 *  by default.  Grayscale and color images are scaled using one
 *  of four methods, depending on the scale factors:
 *   1 antialiased subsampling (lowpass filtering followed by
 *       subsampling, implemented here by area mapping), for scale factors
 *       less than 0.2
 *   2 antialiased subsampling with sharpening, for scale factors
 *       between 0.2 and 0.7
 *   3 linear interpolation with sharpening, for scale factors between
 *       0.7 and 1.4
 *   4 linear interpolation without sharpening, for scale factors >= 1.4.
 *
 *  One could use subsampling for scale factors very close to 1.0,
 *  because it preserves sharp edges.  Linear interpolation blurs
 *  edges because the dest pixels will typically straddle two src edge
 *  pixels.  Subsmpling removes entire columns and rows, so the edge is
 *  not blurred.  However, there are two reasons for not doing this.
 *  First, it moves edges, so that a straight line at a large angle to
 *  both horizontal and vertical will have noticeable kinks where
 *  horizontal and vertical rasters are removed.  Second, although it
 *  is very fast, you get good results on sharp edges by applying
 *  a sharpening filter.
 *
 *  For images with sharp edges, sharpening substantially improves the
 *  image quality for scale factors between about 0.2 and about 2.0.
 *  pixScale uses a small amount of sharpening by default because
 *  it strengthens edge pixels that are weak due to anti-aliasing.
 *  The default sharpening factors are:
 *      * for scaling factors < 0.7:   sharpfract = 0.2    sharpwidth = 1
 *      * for scaling factors >= 0.7:  sharpfract = 0.4    sharpwidth = 2
 *  The cases where the sharpening halfwidth is 1 or 2 have special
 *  implementations and are about twice as fast as the general case.
 *
 *  However, sharpening is computationally expensive, and one needs
 *  to consider the speed-quality tradeoff:
 *      * For upscaling of RGB images, linear interpolation plus default
 *        sharpening is about 5 times slower than upscaling alone.
 *      * For downscaling, area mapping plus default sharpening is
 *        about 10 times slower than downscaling alone.
 *  When the scale factor is larger than 1.4, the cost of sharpening,
 *  which is proportional to image area, is very large compared to the
 *  incremental quality improvement, so we cut off the default use of
 *  sharpening at 1.4.  Thus, for scale factors greater than 1.4,
 *  pixScale only does linear interpolation.
 *
 *  In many situations you will get a satisfactory result by scaling
 *  without sharpening: call pixScaleGeneral with %sharpfract = 0.0.
 *  Alternatively, if you wish to sharpen but not use the default
 *  value, first call pixScaleGeneral with %sharpfract = 0.0, and
 *  then sharpen explicitly using pixUnsharpMasking.
 *
 *  Binary images are scaled to binary by sampling the closest pixel,
 *  without any low-pass filtering averaging of neighboring pixels.
 *  This will introduce aliasing for reductions.  Aliasing can be
 *  prevented by using pixScaleToGray instead.
 */
PIX *
pixScale(PIX       *pixs,
         l_float32  scalex,
         l_float32  scaley)
{
l_int32    sharpwidth;
l_float32  maxscale, sharpfract;

    PROCNAME("pixScale");

    if (!pixs)
        return (PIX *)ERROR_PTR("pixs not defined", procName, NULL);

        /* Reduce the default sharpening factors by 2 if maxscale < 0.7 */
    maxscale = L_MAX(scalex, scaley);
    sharpfract = (maxscale < 0.7) ? 0.2 : 0.4;
    sharpwidth = (maxscale < 0.7) ? 1 : 2;

    return pixScaleGeneral(pixs, scalex, scaley, sharpfract, sharpwidth);
}


/*!
 * \brief   pixScaleToSizeRel()
 *
 * \param[in]    pixs
 * \param[in]    delw  change in width, in pixels; 0 means no change
 * \param[in]    delh  change in height, in pixels; 0 means no change
 * \return  pixd, or NULL on error
 */
PIX *
pixScaleToSizeRel(PIX     *pixs,
                  l_int32  delw,
                  l_int32  delh)
{
l_int32  w, h, wd, hd;

    PROCNAME("pixScaleToSizeRel");

    if (!pixs)
        return (PIX *)ERROR_PTR("pixs not defined", procName, NULL);

    if (delw == 0 && delh == 0)
        return pixCopy(NULL, pixs);

    pixGetDimensions(pixs, &w, &h, NULL);
    wd = w + delw;
    hd = h + delh;
    if (wd <= 0 || hd <= 0)
        return (PIX *)ERROR_PTR("pix dimension reduced to 0", procName, NULL);

    return pixScaleToSize(pixs, wd, hd);
}


/*!
 * \brief   pixScaleToSize()
 *
 * \param[in]    pixs 1, 2, 4, 8, 16 and 32 bpp
 * \param[in]    wd  target width; use 0 if using height as target
 * \param[in]    hd  target height; use 0 if using width as target
 * \return  pixd, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) The output scaled image has the dimension(s) you specify:
 *          * To specify the width with isotropic scaling, set %hd = 0.
 *          * To specify the height with isotropic scaling, set %wd = 0.
 *          * If both %wd and %hd are specified, the image is scaled
 *             (in general, anisotropically) to that size.
 *          * It is an error to set both %wd and %hd to 0.
 * </pre>
 */
PIX *
pixScaleToSize(PIX     *pixs,
               l_int32  wd,
               l_int32  hd)
{
l_int32    w, h;
l_float32  scalex, scaley;

    PROCNAME("pixScaleToSize");

    if (!pixs)
        return (PIX *)ERROR_PTR("pixs not defined", procName, NULL);
    if (wd <= 0 && hd <= 0)
        return (PIX *)ERROR_PTR("neither wd nor hd > 0", procName, NULL);

    pixGetDimensions(pixs, &w, &h, NULL);
    if (wd <= 0) {
        scaley = (l_float32)hd / (l_float32)h;
        scalex = scaley;
    } else if (hd <= 0) {
        scalex = (l_float32)wd / (l_float32)w;
        scaley = scalex;
    } else {
        scalex = (l_float32)wd / (l_float32)w;
        scaley = (l_float32)hd / (l_float32)h;
    }

    return pixScale(pixs, scalex, scaley);
}


/*!
 * \brief   pixScaleGeneral()
 *
 * \param[in]    pixs 1, 2, 4, 8, 16 and 32 bpp
 * \param[in]    scalex, scaley both > 0.0
 * \param[in]    sharpfract use 0.0 to skip sharpening
 * \param[in]    sharpwidth halfwidth of low-pass filter; typ. 1 or 2
 * \return  pixd, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) See pixScale() for usage.
 *      (2) This interface may change in the future, as other special
 *          cases are added.
 *      (3) The actual sharpening factors used depend on the maximum
 *          of the two scale factors (maxscale):
 *            maxscale <= 0.2:        no sharpening
 *            0.2 < maxscale < 1.4:   uses the input parameters
 *            maxscale >= 1.4:        no sharpening
 *      (4) To avoid sharpening for grayscale and color images with
 *          scaling factors between 0.2 and 1.4, call this function
 *          with %sharpfract == 0.0.
 *      (5) To use arbitrary sharpening in conjunction with scaling,
 *          call this function with %sharpfract = 0.0, and follow this
 *          with a call to pixUnsharpMasking() with your chosen parameters.
 * </pre>
 */
PIX *
pixScaleGeneral(PIX       *pixs,
                l_float32  scalex,
                l_float32  scaley,
                l_float32  sharpfract,
                l_int32    sharpwidth)
{
l_int32    d;
l_float32  maxscale;
PIX       *pixt, *pixt2, *pixd;

    PROCNAME("pixScaleGeneral");

    if (!pixs)
        return (PIX *)ERROR_PTR("pixs not defined", procName, NULL);
    d = pixGetDepth(pixs);
    if (d != 1 && d != 2 && d != 4 && d != 8 && d != 16 && d != 32)
        return (PIX *)ERROR_PTR("pixs not {1,2,4,8,16,32} bpp", procName, NULL);
    if (scalex <= 0.0 || scaley <= 0.0)
        return (PIX *)ERROR_PTR("scale factor <= 0", procName, NULL);
    if (scalex == 1.0 && scaley == 1.0)
        return pixCopy(NULL, pixs);

    if (d == 1)
        return pixScaleBinary(pixs, scalex, scaley);

        /* Remove colormap; clone if possible; result is either 8 or 32 bpp */
    if ((pixt = pixConvertTo8Or32(pixs, L_CLONE, 0)) == NULL)
        return (PIX *)ERROR_PTR("pixt not made", procName, NULL);

        /* Scale (up or down) */
    d = pixGetDepth(pixt);
    maxscale = L_MAX(scalex, scaley);
    if (maxscale < 0.7) {  /* area mapping for anti-aliasing */
        pixt2 = pixScaleAreaMap(pixt, scalex, scaley);
        if (maxscale > 0.2 && sharpfract > 0.0 && sharpwidth > 0)
            pixd = pixUnsharpMasking(pixt2, sharpwidth, sharpfract);
        else
            pixd = pixClone(pixt2);
    } else {  /* use linear interpolation */
        if (d == 8)
            pixt2 = pixScaleGrayLI(pixt, scalex, scaley);
        else  /* d == 32 */
            pixt2 = pixScaleColorLI(pixt, scalex, scaley);
        if (maxscale < 1.4 && sharpfract > 0.0 && sharpwidth > 0)
            pixd = pixUnsharpMasking(pixt2, sharpwidth, sharpfract);
        else
            pixd = pixClone(pixt2);
    }

    pixDestroy(&pixt);
    pixDestroy(&pixt2);
    pixCopyText(pixd, pixs);
    pixCopyInputFormat(pixd, pixs);
    return pixd;
}


/*------------------------------------------------------------------*
 *                  Scaling by linear interpolation                 *
 *------------------------------------------------------------------*/
/*!
 * \brief   pixScaleLI()
 *
 * \param[in]    pixs 2, 4, 8 or 32 bpp; with or without colormap
 * \param[in]    scalex, scaley must both be >= 0.7
 * \return  pixd, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) This function should only be used when the scale factors are
 *          greater than or equal to 0.7, and typically greater than 1.
 *          If either scale factor is larger than 0.7, we issue a warning
 *          and call pixScaleGeneral(), which will invoke area mapping
 *          without sharpening.
 *      (2) This works on 2, 4, 8, 16 and 32 bpp images, as well as on
 *          2, 4 and 8 bpp images that have a colormap.  If there is a
 *          colormap, it is removed to either gray or RGB, depending
 *          on the colormap.
 *      (3) This does a linear interpolation on the src image.
 *      (4) It dispatches to much faster implementations for
 *          the special cases of 2x and 4x expansion.
 * </pre>
 */
PIX *
pixScaleLI(PIX       *pixs,
           l_float32  scalex,
           l_float32  scaley)
{
l_int32    d;
l_float32  maxscale;
PIX       *pixt, *pixd;

    PROCNAME("pixScaleLI");

    if (!pixs || (pixGetDepth(pixs) == 1))
        return (PIX *)ERROR_PTR("pixs not defined or 1 bpp", procName, NULL);
    maxscale = L_MAX(scalex, scaley);
    if (maxscale < 0.7) {
        L_WARNING("scaling factors < 0.7; do regular scaling\n", procName);
        return pixScaleGeneral(pixs, scalex, scaley, 0.0, 0);
    }
    d = pixGetDepth(pixs);
    if (d != 2 && d != 4 && d != 8 && d != 16 && d != 32)
        return (PIX *)ERROR_PTR("pixs not {2,4,8,16,32} bpp", procName, NULL);

        /* Remove colormap; clone if possible; result is either 8 or 32 bpp */
    if ((pixt = pixConvertTo8Or32(pixs, L_CLONE, 0)) == NULL)
        return (PIX *)ERROR_PTR("pixt not made", procName, NULL);

    d = pixGetDepth(pixt);
    if (d == 8)
        pixd = pixScaleGrayLI(pixt, scalex, scaley);
    else  /* d == 32 */
        pixd = pixScaleColorLI(pixt, scalex, scaley);

    pixDestroy(&pixt);
    pixCopyInputFormat(pixd, pixs);
    return pixd;
}


/*!
 * \brief   pixScaleColorLI()
 *
 * \param[in]    pixs  32 bpp, representing rgb
 * \param[in]    scalex, scaley must both be >= 0.7
 * \return  pixd, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) If either scale factor is larger than 0.7, we issue a warning
 *          and call pixScaleGeneral(), which will invoke area mapping
 *          without sharpening.  This is particularly important for
 *          document images with sharp edges.
 *      (2) For the general case, it's about 4x faster to manipulate
 *          the color pixels directly, rather than to make images
 *          out of each of the 3 components, scale each component
 *          using the pixScaleGrayLI(), and combine the results back
 *          into an rgb image.
 *      (3) The speed on intel hardware for the general case (not 2x)
 *          is about 10 * 10^6 dest-pixels/sec/GHz.  (The special 2x
 *          case runs at about 80 * 10^6 dest-pixels/sec/GHz.)
 * </pre>
 */
PIX *
pixScaleColorLI(PIX      *pixs,
               l_float32  scalex,
               l_float32  scaley)
{
l_int32    ws, hs, wpls, wd, hd, wpld;
l_uint32  *datas, *datad;
l_float32  maxscale;
PIX       *pixd;

    PROCNAME("pixScaleColorLI");

    if (!pixs || (pixGetDepth(pixs) != 32))
        return (PIX *)ERROR_PTR("pixs undefined or not 32 bpp", procName, NULL);
    maxscale = L_MAX(scalex, scaley);
    if (maxscale < 0.7) {
        L_WARNING("scaling factors < 0.7; do regular scaling\n", procName);
        return pixScaleGeneral(pixs, scalex, scaley, 0.0, 0);
    }

        /* Do fast special cases if possible */
    if (scalex == 1.0 && scaley == 1.0)
        return pixCopy(NULL, pixs);
    if (scalex == 2.0 && scaley == 2.0)
        return pixScaleColor2xLI(pixs);
    if (scalex == 4.0 && scaley == 4.0)
        return pixScaleColor4xLI(pixs);

        /* General case */
    pixGetDimensions(pixs, &ws, &hs, NULL);
    datas = pixGetData(pixs);
    wpls = pixGetWpl(pixs);
    wd = (l_int32)(scalex * (l_float32)ws + 0.5);
    hd = (l_int32)(scaley * (l_float32)hs + 0.5);
    if ((pixd = pixCreate(wd, hd, 32)) == NULL)
        return (PIX *)ERROR_PTR("pixd not made", procName, NULL);
    pixCopyResolution(pixd, pixs);
    pixScaleResolution(pixd, scalex, scaley);
    datad = pixGetData(pixd);
    wpld = pixGetWpl(pixd);
    scaleColorLILow(datad, wd, hd, wpld, datas, ws, hs, wpls);
    if (pixGetSpp(pixs) == 4)
        pixScaleAndTransferAlpha(pixd, pixs, scalex, scaley);

    pixCopyInputFormat(pixd, pixs);
    return pixd;
}


/*!
 * \brief   pixScaleColor2xLI()
 *
 * \param[in]    pixs  32 bpp, representing rgb
 * \return  pixd, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) This is a special case of linear interpolated scaling,
 *          for 2x upscaling.  It is about 8x faster than using
 *          the generic pixScaleColorLI(), and about 4x faster than
 *          using the special 2x scale function pixScaleGray2xLI()
 *          on each of the three components separately.
 *      (2) The speed on intel hardware is about
 *          80 * 10^6 dest-pixels/sec/GHz.
 * </pre>
 */
PIX *
pixScaleColor2xLI(PIX  *pixs)
{
l_int32    ws, hs, wpls, wpld;
l_uint32  *datas, *datad;
PIX       *pixd;

    PROCNAME("pixScaleColor2xLI");

    if (!pixs || (pixGetDepth(pixs) != 32))
        return (PIX *)ERROR_PTR("pixs undefined or not 32 bpp", procName, NULL);

    pixGetDimensions(pixs, &ws, &hs, NULL);
    datas = pixGetData(pixs);
    wpls = pixGetWpl(pixs);
    if ((pixd = pixCreate(2 * ws, 2 * hs, 32)) == NULL)
        return (PIX *)ERROR_PTR("pixd not made", procName, NULL);
    pixCopyResolution(pixd, pixs);
    pixScaleResolution(pixd, 2.0, 2.0);
    datad = pixGetData(pixd);
    wpld = pixGetWpl(pixd);
    scaleColor2xLILow(datad, wpld, datas, ws, hs, wpls);
    if (pixGetSpp(pixs) == 4)
        pixScaleAndTransferAlpha(pixd, pixs, 2.0, 2.0);

    pixCopyInputFormat(pixd, pixs);
    return pixd;
}


/*!
 * \brief   pixScaleColor4xLI()
 *
 * \param[in]    pixs  32 bpp, representing rgb
 * \return  pixd, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) This is a special case of color linear interpolated scaling,
 *          for 4x upscaling.  It is about 3x faster than using
 *          the generic pixScaleColorLI().
 *      (2) The speed on intel hardware is about
 *          30 * 10^6 dest-pixels/sec/GHz
 *      (3) This scales each component separately, using pixScaleGray4xLI().
 *          It would be about 4x faster to inline the color code properly,
 *          in analogy to scaleColor4xLILow(), and I leave this as
 *          an exercise for someone who really needs it.
 * </pre>
 */
PIX *
pixScaleColor4xLI(PIX  *pixs)
{
PIX  *pixr, *pixg, *pixb;
PIX  *pixrs, *pixgs, *pixbs;
PIX  *pixd;

    PROCNAME("pixScaleColor4xLI");

    if (!pixs || (pixGetDepth(pixs) != 32))
        return (PIX *)ERROR_PTR("pixs undefined or not 32 bpp", procName, NULL);

    pixr = pixGetRGBComponent(pixs, COLOR_RED);
    pixrs = pixScaleGray4xLI(pixr);
    pixDestroy(&pixr);
    pixg = pixGetRGBComponent(pixs, COLOR_GREEN);
    pixgs = pixScaleGray4xLI(pixg);
    pixDestroy(&pixg);
    pixb = pixGetRGBComponent(pixs, COLOR_BLUE);
    pixbs = pixScaleGray4xLI(pixb);
    pixDestroy(&pixb);

    if ((pixd = pixCreateRGBImage(pixrs, pixgs, pixbs)) == NULL) {
        L_ERROR("pixd not made\n", procName);
    } else {
        if (pixGetSpp(pixs) == 4)
            pixScaleAndTransferAlpha(pixd, pixs, 4.0, 4.0);
        pixCopyInputFormat(pixd, pixs);
    }

    pixDestroy(&pixrs);
    pixDestroy(&pixgs);
    pixDestroy(&pixbs);
    return pixd;
}


/*!
 * \brief   pixScaleGrayLI()
 *
 * \param[in]    pixs 8 bpp grayscale, no cmap
 * \param[in]    scalex, scaley must both be >= 0.7
 * \return  pixd, or NULL on error
 *
 *  This function is appropriate for upscaling magnification, where the
 *  scale factor is > 1, as well as for a small amount of downscaling
 *  reduction, with scale factor > 0.7.  If the scale factor is < 0.7,
 *  the best result is obtained by area mapping, but this is relatiely
 *  expensive.  A less expensive alternative with scale factor < 0.7
 *  is low-pass filtering followed by subsampling (pixScaleSmooth()),
 *  which is effectively a cheap form of area mapping.
 *
 *  Some more details follow.
 *
 *  For each pixel in the dest, this does a linear
 *  interpolation of 4 neighboring pixels in the src.
 *  Specifically, consider the UL corner of src and
 *  dest pixels.  The UL corner of the dest falls within
 *  a src pixel, whose four corners are the UL corners
 *  of 4 adjacent src pixels.  The value of the dest
 *  is taken by linear interpolation using the values of
 *  the four src pixels and the distance of the UL corner
 *  of the dest from each corner.
 *
 *  If the image is expanded so that the dest pixel is
 *  smaller than the src pixel, such interpolation
 *  is a reasonable approach.  This interpolation is
 *  also good for a small image reduction factor that
 *  is not more than a 2x reduction.
 *
 *  Note that the linear interpolation algorithm for scaling
 *  is identical in form to the area-mapping algorithm
 *  for grayscale rotation.  The latter corresponds to a
 *  translation of each pixel without scaling.
 *
 *  This function is NOT optimal if the scaling involves
 *  a large reduction.    If the image is significantly
 *  reduced, so that the dest pixel is much larger than
 *  the src pixels, this interpolation, which is over src
 *  pixels only near the UL corner of the dest pixel,
 *  is not going to give a good area-mapping average.
 *  Because area mapping for image scaling is considerably
 *  more computationally intensive than linear interpolation,
 *  we choose not to use it.   For large image reduction,
 *  linear interpolation over adjacent src pixels
 *  degenerates asymptotically to subsampling.  But
 *  subsampling without a low-pass pre-filter causes
 *  aliasing by the nyquist theorem.  To avoid aliasing,
 *  a low-pass filter e.g., an averaging filter of
 *  size roughly equal to the dest pixel i.e., the
 *  reduction factor should be applied to the src before
 *  subsampling.
 *
 *  As an alternative to low-pass filtering and subsampling
 *  for large reduction factors, linear interpolation can
 *  also be done between the widely separated src pixels in
 *  which the corners of the dest pixel lie.  This also is
 *  not optimal, as it samples src pixels only near the
 *  corners of the dest pixel, and it is not implemented.
 *
 *  The speed on circa 2005 Intel hardware for the general case (not 2x)
 *  is about 13 * 10^6 dest-pixels/sec/GHz.  The special 2x case runs
 *  at about 100 * 10^6 dest-pixels/sec/GHz.
 */
PIX *
pixScaleGrayLI(PIX       *pixs,
               l_float32  scalex,
               l_float32  scaley)
{
l_int32    ws, hs, wpls, wd, hd, wpld;
l_uint32  *datas, *datad;
l_float32  maxscale;
PIX       *pixd;

    PROCNAME("pixScaleGrayLI");

    if (!pixs || pixGetDepth(pixs) != 8 || pixGetColormap(pixs))
        return (PIX *)ERROR_PTR("pixs undefined, cmapped or not 8 bpp",
                                procName, NULL);
    maxscale = L_MAX(scalex, scaley);
    if (maxscale < 0.7) {
        L_WARNING("scaling factors < 0.7; do regular scaling\n", procName);
        return pixScaleGeneral(pixs, scalex, scaley, 0.0, 0);
    }

        /* Do fast special cases if possible */
    if (scalex == 1.0 && scaley == 1.0)
        return pixCopy(NULL, pixs);
    if (scalex == 2.0 && scaley == 2.0)
        return pixScaleGray2xLI(pixs);
    if (scalex == 4.0 && scaley == 4.0)
        return pixScaleGray4xLI(pixs);

        /* General case */
    pixGetDimensions(pixs, &ws, &hs, NULL);
    datas = pixGetData(pixs);
    wpls = pixGetWpl(pixs);
    wd = (l_int32)(scalex * (l_float32)ws + 0.5);
    hd = (l_int32)(scaley * (l_float32)hs + 0.5);
    if ((pixd = pixCreate(wd, hd, 8)) == NULL)
        return (PIX *)ERROR_PTR("pixd not made", procName, NULL);
    pixCopyText(pixd, pixs);
    pixCopyResolution(pixd, pixs);
    pixCopyInputFormat(pixd, pixs);
    pixScaleResolution(pixd, scalex, scaley);
    datad = pixGetData(pixd);
    wpld = pixGetWpl(pixd);
    scaleGrayLILow(datad, wd, hd, wpld, datas, ws, hs, wpls);
    return pixd;
}


/*!
 * \brief   pixScaleGray2xLI()
 *
 * \param[in]    pixs 8 bpp grayscale, not cmapped
 * \return  pixd, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) This is a special case of gray linear interpolated scaling,
 *          for 2x upscaling.  It is about 6x faster than using
 *          the generic pixScaleGrayLI().
 *      (2) The speed on intel hardware is about
 *          100 * 10^6 dest-pixels/sec/GHz
 * </pre>
 */
PIX *
pixScaleGray2xLI(PIX  *pixs)
{
l_int32    ws, hs, wpls, wpld;
l_uint32  *datas, *datad;
PIX       *pixd;

    PROCNAME("pixScaleGray2xLI");

    if (!pixs || pixGetDepth(pixs) != 8 || pixGetColormap(pixs))
        return (PIX *)ERROR_PTR("pixs undefined, cmapped or not 8 bpp",
                                procName, NULL);

    pixGetDimensions(pixs, &ws, &hs, NULL);
    datas = pixGetData(pixs);
    wpls = pixGetWpl(pixs);
    if ((pixd = pixCreate(2 * ws, 2 * hs, 8)) == NULL)
        return (PIX *)ERROR_PTR("pixd not made", procName, NULL);
    pixCopyResolution(pixd, pixs);
    pixCopyInputFormat(pixd, pixs);
    pixScaleResolution(pixd, 2.0, 2.0);
    datad = pixGetData(pixd);
    wpld = pixGetWpl(pixd);
    scaleGray2xLILow(datad, wpld, datas, ws, hs, wpls);
    return pixd;
}


/*!
 * \brief   pixScaleGray4xLI()
 *
 * \param[in]    pixs 8 bpp grayscale, not cmapped
 * \return  pixd, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) This is a special case of gray linear interpolated scaling,
 *          for 4x upscaling.  It is about 12x faster than using
 *          the generic pixScaleGrayLI().
 *      (2) The speed on intel hardware is about
 *          160 * 10^6 dest-pixels/sec/GHz.
 * </pre>
 */
PIX *
pixScaleGray4xLI(PIX  *pixs)
{
l_int32    ws, hs, wpls, wpld;
l_uint32  *datas, *datad;
PIX       *pixd;

    PROCNAME("pixScaleGray4xLI");

    if (!pixs || pixGetDepth(pixs) != 8 || pixGetColormap(pixs))
        return (PIX *)ERROR_PTR("pixs undefined, cmapped or not 8 bpp",
                                procName, NULL);

    pixGetDimensions(pixs, &ws, &hs, NULL);
    datas = pixGetData(pixs);
    wpls = pixGetWpl(pixs);
    if ((pixd = pixCreate(4 * ws, 4 * hs, 8)) == NULL)
        return (PIX *)ERROR_PTR("pixd not made", procName, NULL);
    pixCopyResolution(pixd, pixs);
    pixCopyInputFormat(pixd, pixs);
    pixScaleResolution(pixd, 4.0, 4.0);
    datad = pixGetData(pixd);
    wpld = pixGetWpl(pixd);
    scaleGray4xLILow(datad, wpld, datas, ws, hs, wpls);
    return pixd;
}


/*------------------------------------------------------------------*
 *                Scale 2x followed by binarization                 *
 *------------------------------------------------------------------*/
/*!
 * \brief   pixScaleGray2xLIThresh()
 *
 * \param[in]    pixs 8 bpp, not cmapped
 * \param[in]    thresh  between 0 and 256
 * \return  pixd 1 bpp, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) This does 2x upscale on pixs, using linear interpolation,
 *          followed by thresholding to binary.
 *      (2) Buffers are used to avoid making a large grayscale image.
 * </pre>
 */
PIX *
pixScaleGray2xLIThresh(PIX     *pixs,
                       l_int32  thresh)
{
l_int32    i, ws, hs, hsm, wd, hd, wpls, wplb, wpld;
l_uint32  *datas, *datad, *lines, *lined, *lineb;
PIX       *pixd;

    PROCNAME("pixScaleGray2xLIThresh");

    if (!pixs || pixGetDepth(pixs) != 8 || pixGetColormap(pixs))
        return (PIX *)ERROR_PTR("pixs undefined, not 8 bpp, or cmapped",
                                procName, NULL);
    if (thresh < 0 || thresh > 256)
        return (PIX *)ERROR_PTR("thresh must be in [0, ... 256]",
            procName, NULL);

    pixGetDimensions(pixs, &ws, &hs, NULL);
    wd = 2 * ws;
    hd = 2 * hs;
    hsm = hs - 1;
    datas = pixGetData(pixs);
    wpls = pixGetWpl(pixs);

        /* Make line buffer for 2 lines of virtual intermediate image */
    wplb = (wd + 3) / 4;
    if ((lineb = (l_uint32 *)LEPT_CALLOC(2 * wplb, sizeof(l_uint32))) == NULL)
        return (PIX *)ERROR_PTR("lineb not made", procName, NULL);

        /* Make dest binary image */
    if ((pixd = pixCreate(wd, hd, 1)) == NULL) {
        LEPT_FREE(lineb);
        return (PIX *)ERROR_PTR("pixd not made", procName, NULL);
    }
    pixCopyInputFormat(pixd, pixs);
    pixCopyResolution(pixd, pixs);
    pixScaleResolution(pixd, 2.0, 2.0);
    wpld = pixGetWpl(pixd);
    datad = pixGetData(pixd);

        /* Do all but last src line */
    for (i = 0; i < hsm; i++) {
        lines = datas + i * wpls;
        lined = datad + 2 * i * wpld;  /* do 2 dest lines at a time */
        scaleGray2xLILineLow(lineb, wplb, lines, ws, wpls, 0);
        thresholdToBinaryLineLow(lined, wd, lineb, 8, thresh);
        thresholdToBinaryLineLow(lined + wpld, wd, lineb + wplb, 8, thresh);
    }

        /* Do last src line */
    lines = datas + hsm * wpls;
    lined = datad + 2 * hsm * wpld;
    scaleGray2xLILineLow(lineb, wplb, lines, ws, wpls, 1);
    thresholdToBinaryLineLow(lined, wd, lineb, 8, thresh);
    thresholdToBinaryLineLow(lined + wpld, wd, lineb + wplb, 8, thresh);

    LEPT_FREE(lineb);
    return pixd;
}


/*!
 * \brief   pixScaleGray2xLIDither()
 *
 * \param[in]    pixs 8 bpp, not cmapped
 * \return  pixd 1 bpp, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) This does 2x upscale on pixs, using linear interpolation,
 *          followed by Floyd-Steinberg dithering to binary.
 *      (2) Buffers are used to avoid making a large grayscale image.
 *          ~ Two line buffers are used for the src, required for the 2x
 *            LI upscale.
 *          ~ Three line buffers are used for the intermediate image.
 *            Two are filled with each 2xLI row operation; the third is
 *            needed because the upscale and dithering ops are out of sync.
 * </pre>
 */
PIX *
pixScaleGray2xLIDither(PIX  *pixs)
{
l_int32    i, ws, hs, hsm, wd, hd, wpls, wplb, wpld;
l_uint32  *datas, *datad;
l_uint32  *lined;
l_uint32  *lineb = NULL;   /* 2 intermediate buffer lines */
l_uint32  *linebp = NULL;  /* 1 intermediate buffer line */
l_uint32  *bufs = NULL;    /* 2 source buffer lines */
PIX       *pixd = NULL;

    PROCNAME("pixScaleGray2xLIDither");

    if (!pixs || pixGetDepth(pixs) != 8 || pixGetColormap(pixs))
        return (PIX *)ERROR_PTR("pixs undefined, not 8 bpp, or cmapped",
                                procName, NULL);

    pixGetDimensions(pixs, &ws, &hs, NULL);
    wd = 2 * ws;
    hd = 2 * hs;
    hsm = hs - 1;
    datas = pixGetData(pixs);
    wpls = pixGetWpl(pixs);

        /* Make line buffers for 2 lines of src image */
    if ((bufs = (l_uint32 *)LEPT_CALLOC(2 * wpls, sizeof(l_uint32))) == NULL)
        return (PIX *)ERROR_PTR("bufs not made", procName, NULL);

        /* Make line buffer for 2 lines of virtual intermediate image */
    wplb = (wd + 3) / 4;
    if ((lineb = (l_uint32 *)LEPT_CALLOC(2 * wplb, sizeof(l_uint32))) == NULL) {
        L_ERROR("lineb not made\n", procName);
        goto cleanup;
    }

        /* Make line buffer for 1 line of virtual intermediate image */
    if ((linebp = (l_uint32 *)LEPT_CALLOC(wplb, sizeof(l_uint32))) == NULL) {
        L_ERROR("linebp not made\n", procName);
        goto cleanup;
    }

        /* Make dest binary image */
    if ((pixd = pixCreate(wd, hd, 1)) == NULL) {
        L_ERROR("pixd not made\n", procName);
        goto cleanup;
    }
    pixCopyInputFormat(pixd, pixs);
    pixCopyResolution(pixd, pixs);
    pixScaleResolution(pixd, 2.0, 2.0);
    wpld = pixGetWpl(pixd);
    datad = pixGetData(pixd);

        /* Start with the first src and the first dest line */
    memcpy(bufs, datas, 4 * wpls);   /* first src line */
    memcpy(bufs + wpls, datas + wpls, 4 * wpls);  /* 2nd src line */
    scaleGray2xLILineLow(lineb, wplb, bufs, ws, wpls, 0);  /* 2 i lines */
    lined = datad;
    ditherToBinaryLineLow(lined, wd, lineb, lineb + wplb,
                          DEFAULT_CLIP_LOWER_1, DEFAULT_CLIP_UPPER_1, 0);
                                                    /* 1st d line */

        /* Do all but last src line */
    for (i = 1; i < hsm; i++) {
        memcpy(bufs, datas + i * wpls, 4 * wpls);  /* i-th src line */
        memcpy(bufs + wpls, datas + (i + 1) * wpls, 4 * wpls);
        memcpy(linebp, lineb + wplb, 4 * wplb);
        scaleGray2xLILineLow(lineb, wplb, bufs, ws, wpls, 0);  /* 2 i lines */
        lined = datad + 2 * i * wpld;
        ditherToBinaryLineLow(lined - wpld, wd, linebp, lineb,
                              DEFAULT_CLIP_LOWER_1, DEFAULT_CLIP_UPPER_1, 0);
                                                   /* odd dest line */
        ditherToBinaryLineLow(lined, wd, lineb, lineb + wplb,
                              DEFAULT_CLIP_LOWER_1, DEFAULT_CLIP_UPPER_1, 0);
                                                   /* even dest line */
    }

        /* Do the last src line and the last 3 dest lines */
    memcpy(bufs, datas + hsm * wpls, 4 * wpls);  /* hsm-th src line */
    memcpy(linebp, lineb + wplb, 4 * wplb);   /* 1 i line */
    scaleGray2xLILineLow(lineb, wplb, bufs, ws, wpls, 1);  /* 2 i lines */
    ditherToBinaryLineLow(lined + wpld, wd, linebp, lineb,
                          DEFAULT_CLIP_LOWER_1, DEFAULT_CLIP_UPPER_1, 0);
                                                   /* odd dest line */
    ditherToBinaryLineLow(lined + 2 * wpld, wd, lineb, lineb + wplb,
                          DEFAULT_CLIP_LOWER_1, DEFAULT_CLIP_UPPER_1, 0);
                                                   /* even dest line */
    ditherToBinaryLineLow(lined + 3 * wpld, wd, lineb + wplb, NULL,
                          DEFAULT_CLIP_LOWER_1, DEFAULT_CLIP_UPPER_1, 1);
                                                   /* last dest line */

cleanup:
    LEPT_FREE(bufs);
    LEPT_FREE(lineb);
    LEPT_FREE(linebp);
    return pixd;
}


/*------------------------------------------------------------------*
 *                Scale 4x followed by binarization                 *
 *------------------------------------------------------------------*/
/*!
 * \brief   pixScaleGray4xLIThresh()
 *
 * \param[in]    pixs 8 bpp
 * \param[in]    thresh  between 0 and 256
 * \return  pixd 1 bpp, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) This does 4x upscale on pixs, using linear interpolation,
 *          followed by thresholding to binary.
 *      (2) Buffers are used to avoid making a large grayscale image.
 *      (3) If a full 4x expanded grayscale image can be kept in memory,
 *          this function is only about 10% faster than separately doing
 *          a linear interpolation to a large grayscale image, followed
 *          by thresholding to binary.
 * </pre>
 */
PIX *
pixScaleGray4xLIThresh(PIX     *pixs,
                       l_int32  thresh)
{
l_int32    i, j, ws, hs, hsm, wd, hd, wpls, wplb, wpld;
l_uint32  *datas, *datad, *lines, *lined, *lineb;
PIX       *pixd;

    PROCNAME("pixScaleGray4xLIThresh");

    if (!pixs || pixGetDepth(pixs) != 8 || pixGetColormap(pixs))
        return (PIX *)ERROR_PTR("pixs undefined, not 8 bpp, or cmapped",
                                procName, NULL);
    if (thresh < 0 || thresh > 256)
        return (PIX *)ERROR_PTR("thresh must be in [0, ... 256]",
            procName, NULL);

    pixGetDimensions(pixs, &ws, &hs, NULL);
    wd = 4 * ws;
    hd = 4 * hs;
    hsm = hs - 1;
    datas = pixGetData(pixs);
    wpls = pixGetWpl(pixs);

        /* Make line buffer for 4 lines of virtual intermediate image */
    wplb = (wd + 3) / 4;
    if ((lineb = (l_uint32 *)LEPT_CALLOC(4 * wplb, sizeof(l_uint32))) == NULL)
        return (PIX *)ERROR_PTR("lineb not made", procName, NULL);

        /* Make dest binary image */
    if ((pixd = pixCreate(wd, hd, 1)) == NULL) {
        LEPT_FREE(lineb);
        return (PIX *)ERROR_PTR("pixd not made", procName, NULL);
    }
    pixCopyInputFormat(pixd, pixs);
    pixCopyResolution(pixd, pixs);
    pixScaleResolution(pixd, 4.0, 4.0);
    wpld = pixGetWpl(pixd);
    datad = pixGetData(pixd);

        /* Do all but last src line */
    for (i = 0; i < hsm; i++) {
        lines = datas + i * wpls;
        lined = datad + 4 * i * wpld;  /* do 4 dest lines at a time */
        scaleGray4xLILineLow(lineb, wplb, lines, ws, wpls, 0);
        for (j = 0; j < 4; j++) {
            thresholdToBinaryLineLow(lined + j * wpld, wd,
                                     lineb + j * wplb, 8, thresh);
        }
    }

        /* Do last src line */
    lines = datas + hsm * wpls;
    lined = datad + 4 * hsm * wpld;
    scaleGray4xLILineLow(lineb, wplb, lines, ws, wpls, 1);
    for (j = 0; j < 4; j++) {
        thresholdToBinaryLineLow(lined + j * wpld, wd,
                                 lineb + j * wplb, 8, thresh);
    }

    LEPT_FREE(lineb);
    return pixd;
}


/*!
 * \brief   pixScaleGray4xLIDither()
 *
 * \param[in]    pixs 8 bpp, not cmapped
 * \return  pixd 1 bpp, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) This does 4x upscale on pixs, using linear interpolation,
 *          followed by Floyd-Steinberg dithering to binary.
 *      (2) Buffers are used to avoid making a large grayscale image.
 *          ~ Two line buffers are used for the src, required for the
 *            4xLI upscale.
 *          ~ Five line buffers are used for the intermediate image.
 *            Four are filled with each 4xLI row operation; the fifth
 *            is needed because the upscale and dithering ops are
 *            out of sync.
 *      (3) If a full 4x expanded grayscale image can be kept in memory,
 *          this function is only about 5% faster than separately doing
 *          a linear interpolation to a large grayscale image, followed
 *          by error-diffusion dithering to binary.
 * </pre>
 */
PIX *
pixScaleGray4xLIDither(PIX  *pixs)
{
l_int32    i, j, ws, hs, hsm, wd, hd, wpls, wplb, wpld;
l_uint32  *datas, *datad;
l_uint32  *lined;
l_uint32  *lineb = NULL;   /* 4 intermediate buffer lines */
l_uint32  *linebp = NULL;  /* 1 intermediate buffer line */
l_uint32  *bufs = NULL;    /* 2 source buffer lines */
PIX       *pixd = NULL;

    PROCNAME("pixScaleGray4xLIDither");

    if (!pixs || pixGetDepth(pixs) != 8 || pixGetColormap(pixs))
        return (PIX *)ERROR_PTR("pixs undefined, not 8 bpp, or cmapped",
                                procName, NULL);

    pixGetDimensions(pixs, &ws, &hs, NULL);
    wd = 4 * ws;
    hd = 4 * hs;
    hsm = hs - 1;
    datas = pixGetData(pixs);
    wpls = pixGetWpl(pixs);

        /* Make line buffers for 2 lines of src image */
    if ((bufs = (l_uint32 *)LEPT_CALLOC(2 * wpls, sizeof(l_uint32))) == NULL)
        return (PIX *)ERROR_PTR("bufs not made", procName, NULL);

        /* Make line buffer for 4 lines of virtual intermediate image */
    wplb = (wd + 3) / 4;
    if ((lineb = (l_uint32 *)LEPT_CALLOC(4 * wplb, sizeof(l_uint32))) == NULL) {
        L_ERROR("lineb not made\n", procName);
        goto cleanup;
    }

        /* Make line buffer for 1 line of virtual intermediate image */
    if ((linebp = (l_uint32 *)LEPT_CALLOC(wplb, sizeof(l_uint32))) == NULL) {
        L_ERROR("linebp not made\n", procName);
        goto cleanup;
    }

        /* Make dest binary image */
    if ((pixd = pixCreate(wd, hd, 1)) == NULL) {
        L_ERROR("pixd not made\n", procName);
        goto cleanup;
    }
    pixCopyInputFormat(pixd, pixs);
    pixCopyResolution(pixd, pixs);
    pixScaleResolution(pixd, 4.0, 4.0);
    wpld = pixGetWpl(pixd);
    datad = pixGetData(pixd);

        /* Start with the first src and the first 3 dest lines */
    memcpy(bufs, datas, 4 * wpls);   /* first src line */
    memcpy(bufs + wpls, datas + wpls, 4 * wpls);  /* 2nd src line */
    scaleGray4xLILineLow(lineb, wplb, bufs, ws, wpls, 0);  /* 4 b lines */
    lined = datad;
    for (j = 0; j < 3; j++) {  /* first 3 d lines of Q */
        ditherToBinaryLineLow(lined + j * wpld, wd, lineb + j * wplb,
                              lineb + (j + 1) * wplb,
                              DEFAULT_CLIP_LOWER_1, DEFAULT_CLIP_UPPER_1, 0);
    }

        /* Do all but last src line */
    for (i = 1; i < hsm; i++) {
        memcpy(bufs, datas + i * wpls, 4 * wpls);  /* i-th src line */
        memcpy(bufs + wpls, datas + (i + 1) * wpls, 4 * wpls);
        memcpy(linebp, lineb + 3 * wplb, 4 * wplb);
        scaleGray4xLILineLow(lineb, wplb, bufs, ws, wpls, 0);  /* 4 b lines */
        lined = datad + 4 * i * wpld;
        ditherToBinaryLineLow(lined - wpld, wd, linebp, lineb,
                              DEFAULT_CLIP_LOWER_1, DEFAULT_CLIP_UPPER_1, 0);
                                                     /* 4th dest line of Q */
        for (j = 0; j < 3; j++) {  /* next 3 d lines of Quad */
            ditherToBinaryLineLow(lined + j * wpld, wd, lineb + j * wplb,
                                  lineb + (j + 1) * wplb,
                                 DEFAULT_CLIP_LOWER_1, DEFAULT_CLIP_UPPER_1, 0);
        }
    }

        /* Do the last src line and the last 5 dest lines */
    memcpy(bufs, datas + hsm * wpls, 4 * wpls);  /* hsm-th src line */
    memcpy(linebp, lineb + 3 * wplb, 4 * wplb);   /* 1 b line */
    scaleGray4xLILineLow(lineb, wplb, bufs, ws, wpls, 1);  /* 4 b lines */
    lined = datad + 4 * hsm * wpld;
    ditherToBinaryLineLow(lined - wpld, wd, linebp, lineb,
                          DEFAULT_CLIP_LOWER_1, DEFAULT_CLIP_UPPER_1, 0);
                                                   /* 4th dest line of Q */
    for (j = 0; j < 3; j++) {  /* next 3 d lines of Quad */
        ditherToBinaryLineLow(lined + j * wpld, wd, lineb + j * wplb,
                              lineb + (j + 1) * wplb,
                              DEFAULT_CLIP_LOWER_1, DEFAULT_CLIP_UPPER_1, 0);
    }
        /* And finally, the last dest line */
    ditherToBinaryLineLow(lined + 3 * wpld, wd, lineb + 3 * wplb, NULL,
                              DEFAULT_CLIP_LOWER_1, DEFAULT_CLIP_UPPER_1, 1);

cleanup:
    LEPT_FREE(bufs);
    LEPT_FREE(lineb);
    LEPT_FREE(linebp);
    return pixd;
}


/*------------------------------------------------------------------*
 *                  Scaling by closest pixel sampling               *
 *------------------------------------------------------------------*/
/*!
 * \brief   pixScaleBySampling()
 *
 * \param[in]    pixs 1, 2, 4, 8, 16, 32 bpp
 * \param[in]    scalex, scaley both > 0.0
 * \return  pixd, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) This function samples from the source without
 *          filtering.  As a result, aliasing will occur for
 *          subsampling (%scalex and/or %scaley < 1.0).
 *      (2) If %scalex == 1.0 and %scaley == 1.0, returns a copy.
 * </pre>
 */
PIX *
pixScaleBySampling(PIX       *pixs,
                   l_float32  scalex,
                   l_float32  scaley)
{
l_int32    ws, hs, d, wpls, wd, hd, wpld;
l_uint32  *datas, *datad;
PIX       *pixd;

    PROCNAME("pixScaleBySampling");

    if (!pixs)
        return (PIX *)ERROR_PTR("pixs not defined", procName, NULL);
    if (scalex <= 0.0 || scaley <= 0.0)
        return (PIX *)ERROR_PTR("scale factor <= 0", procName, NULL);
    if (scalex == 1.0 && scaley == 1.0)
        return pixCopy(NULL, pixs);
    if ((d = pixGetDepth(pixs)) == 1)
        return pixScaleBinary(pixs, scalex, scaley);

    pixGetDimensions(pixs, &ws, &hs, NULL);
    datas = pixGetData(pixs);
    wpls = pixGetWpl(pixs);
    wd = (l_int32)(scalex * (l_float32)ws + 0.5);
    hd = (l_int32)(scaley * (l_float32)hs + 0.5);
    if ((pixd = pixCreate(wd, hd, d)) == NULL)
        return (PIX *)ERROR_PTR("pixd not made", procName, NULL);
    pixCopyResolution(pixd, pixs);
    pixScaleResolution(pixd, scalex, scaley);
    pixCopyColormap(pixd, pixs);
    pixCopyText(pixd, pixs);
    pixCopyInputFormat(pixd, pixs);
    pixCopySpp(pixd, pixs);
    datad = pixGetData(pixd);
    wpld = pixGetWpl(pixd);
    scaleBySamplingLow(datad, wd, hd, wpld, datas, ws, hs, d, wpls);
    if (d == 32 && pixGetSpp(pixs) == 4)
        pixScaleAndTransferAlpha(pixd, pixs, scalex, scaley);

    return pixd;
}


/*!
 * \brief   pixScaleBySamplingToSize()
 *
 * \param[in]    pixs 1, 2, 4, 8, 16 and 32 bpp
 * \param[in]    wd  target width; use 0 if using height as target
 * \param[in]    hd  target height; use 0 if using width as target
 * \return  pixd, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) This guarantees that the output scaled image has the
 *          dimension(s) you specify.
 *           ~ To specify the width with isotropic scaling, set %hd = 0.
 *           ~ To specify the height with isotropic scaling, set %wd = 0.
 *           ~ If both %wd and %hd are specified, the image is scaled
 *             (in general, anisotropically) to that size.
 *           ~ It is an error to set both %wd and %hd to 0.
 * </pre>
 */
PIX *
pixScaleBySamplingToSize(PIX     *pixs,
                         l_int32  wd,
                         l_int32  hd)
{
l_int32    w, h;
l_float32  scalex, scaley;

    PROCNAME("pixScaleBySamplingToSize");

    if (!pixs)
        return (PIX *)ERROR_PTR("pixs not defined", procName, NULL);
    if (wd <= 0 && hd <= 0)
        return (PIX *)ERROR_PTR("neither wd nor hd > 0", procName, NULL);

    pixGetDimensions(pixs, &w, &h, NULL);
    if (wd <= 0) {
        scaley = (l_float32)hd / (l_float32)h;
        scalex = scaley;
    } else if (hd <= 0) {
        scalex = (l_float32)wd / (l_float32)w;
        scaley = scalex;
    } else {
        scalex = (l_float32)wd / (l_float32)w;
        scaley = (l_float32)hd / (l_float32)h;
    }

    return pixScaleBySampling(pixs, scalex, scaley);
}


/*!
 * \brief   pixScaleByIntSampling()
 *
 * \param[in]    pixs 1, 2, 4, 8, 16, 32 bpp
 * \param[in]    factor integer subsampling
 * \return  pixd, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) Simple interface to pixScaleBySampling(), for
 *          isotropic integer reduction.
 *      (2) If %factor == 1, returns a copy.
 * </pre>
 */
PIX *
pixScaleByIntSampling(PIX     *pixs,
                      l_int32  factor)
{
l_float32  scale;

    PROCNAME("pixScaleByIntSampling");

    if (!pixs)
        return (PIX *)ERROR_PTR("pixs not defined", procName, NULL);
    if (factor <= 1) {
        if (factor < 1)
            L_ERROR("factor must be >= 1; returning a copy\n", procName);
        return pixCopy(NULL, pixs);
    }

    scale = 1. / (l_float32)factor;
    return pixScaleBySampling(pixs, scale, scale);
}


/*------------------------------------------------------------------*
 *            Fast integer factor subsampling RGB to gray           *
 *------------------------------------------------------------------*/
/*!
 * \brief   pixScaleRGBToGrayFast()
 *
 * \param[in]    pixs 32 bpp rgb
 * \param[in]    factor integer reduction factor >= 1
 * \param[in]    color one of COLOR_RED, COLOR_GREEN, COLOR_BLUE
 * \return  pixd 8 bpp, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) This does simultaneous subsampling by an integer factor and
 *          extraction of the color from the RGB pix.
 *      (2) It is designed for maximum speed, and is used for quickly
 *          generating a downsized grayscale image from a higher resolution
 *          RGB image.  This would typically be used for image analysis.
 *      (3) The standard color byte order (RGBA) is assumed.
 * </pre>
 */
PIX *
pixScaleRGBToGrayFast(PIX     *pixs,
                      l_int32  factor,
                      l_int32  color)
{
l_int32    byteval, shift;
l_int32    i, j, ws, hs, wd, hd, wpls, wpld;
l_uint32  *datas, *words, *datad, *lined;
l_float32  scale;
PIX       *pixd;

    PROCNAME("pixScaleRGBToGrayFast");

    if (!pixs)
        return (PIX *)ERROR_PTR("pixs not defined", procName, NULL);
    if (pixGetDepth(pixs) != 32)
        return (PIX *)ERROR_PTR("depth not 32 bpp", procName, NULL);
    if (factor < 1)
        return (PIX *)ERROR_PTR("factor must be >= 1", procName, NULL);

    if (color == COLOR_RED)
        shift = L_RED_SHIFT;
    else if (color == COLOR_GREEN)
        shift = L_GREEN_SHIFT;
    else if (color == COLOR_BLUE)
        shift = L_BLUE_SHIFT;
    else
        return (PIX *)ERROR_PTR("invalid color", procName, NULL);

    pixGetDimensions(pixs, &ws, &hs, NULL);
    datas = pixGetData(pixs);
    wpls = pixGetWpl(pixs);

    wd = ws / factor;
    hd = hs / factor;
    if ((pixd = pixCreate(wd, hd, 8)) == NULL)
        return (PIX *)ERROR_PTR("pixd not made", procName, NULL);
    pixCopyResolution(pixd, pixs);
    pixCopyInputFormat(pixd, pixs);
    scale = 1. / (l_float32) factor;
    pixScaleResolution(pixd, scale, scale);
    datad = pixGetData(pixd);
    wpld = pixGetWpl(pixd);

    for (i = 0; i < hd; i++) {
        words = datas + i * factor * wpls;
        lined = datad + i * wpld;
        for (j = 0; j < wd; j++, words += factor) {
            byteval = ((*words) >> shift) & 0xff;
            SET_DATA_BYTE(lined, j, byteval);
        }
    }

    return pixd;
}


/*!
 * \brief   pixScaleRGBToBinaryFast()
 *
 * \param[in]    pixs 32 bpp RGB
 * \param[in]    factor integer reduction factor >= 1
 * \param[in]    thresh binarization threshold
 * \return  pixd 1 bpp, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) This does simultaneous subsampling by an integer factor and
 *          conversion from RGB to gray to binary.
 *      (2) It is designed for maximum speed, and is used for quickly
 *          generating a downsized binary image from a higher resolution
 *          RGB image.  This would typically be used for image analysis.
 *      (3) It uses the green channel to represent the RGB pixel intensity.
 * </pre>
 */
PIX *
pixScaleRGBToBinaryFast(PIX     *pixs,
                        l_int32  factor,
                        l_int32  thresh)
{
l_int32    byteval;
l_int32    i, j, ws, hs, wd, hd, wpls, wpld;
l_uint32  *datas, *words, *datad, *lined;
l_float32  scale;
PIX       *pixd;

    PROCNAME("pixScaleRGBToBinaryFast");

    if (!pixs)
        return (PIX *)ERROR_PTR("pixs not defined", procName, NULL);
    if (factor < 1)
        return (PIX *)ERROR_PTR("factor must be >= 1", procName, NULL);
    if (pixGetDepth(pixs) != 32)
        return (PIX *)ERROR_PTR("depth not 32 bpp", procName, NULL);

    pixGetDimensions(pixs, &ws, &hs, NULL);
    datas = pixGetData(pixs);
    wpls = pixGetWpl(pixs);

    wd = ws / factor;
    hd = hs / factor;
    if ((pixd = pixCreate(wd, hd, 1)) == NULL)
        return (PIX *)ERROR_PTR("pixd not made", procName, NULL);
    pixCopyResolution(pixd, pixs);
    pixCopyInputFormat(pixd, pixs);
    scale = 1. / (l_float32) factor;
    pixScaleResolution(pixd, scale, scale);
    datad = pixGetData(pixd);
    wpld = pixGetWpl(pixd);

    for (i = 0; i < hd; i++) {
        words = datas + i * factor * wpls;
        lined = datad + i * wpld;
        for (j = 0; j < wd; j++, words += factor) {
            byteval = ((*words) >> L_GREEN_SHIFT) & 0xff;
            if (byteval < thresh)
                SET_DATA_BIT(lined, j);
        }
    }

    return pixd;
}


/*!
 * \brief   pixScaleGrayToBinaryFast()
 *
 * \param[in]    pixs 8 bpp grayscale
 * \param[in]    factor integer reduction factor >= 1
 * \param[in]    thresh binarization threshold
 * \return  pixd 1 bpp, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) This does simultaneous subsampling by an integer factor and
 *          thresholding from gray to binary.
 *      (2) It is designed for maximum speed, and is used for quickly
 *          generating a downsized binary image from a higher resolution
 *          gray image.  This would typically be used for image analysis.
 * </pre>
 */
PIX *
pixScaleGrayToBinaryFast(PIX     *pixs,
                         l_int32  factor,
                         l_int32  thresh)
{
l_int32    byteval;
l_int32    i, j, ws, hs, wd, hd, wpls, wpld, sj;
l_uint32  *datas, *datad, *lines, *lined;
l_float32  scale;
PIX       *pixd;

    PROCNAME("pixScaleGrayToBinaryFast");

    if (!pixs)
        return (PIX *)ERROR_PTR("pixs not defined", procName, NULL);
    if (factor < 1)
        return (PIX *)ERROR_PTR("factor must be >= 1", procName, NULL);
    if (pixGetDepth(pixs) != 8)
        return (PIX *)ERROR_PTR("depth not 8 bpp", procName, NULL);

    pixGetDimensions(pixs, &ws, &hs, NULL);
    datas = pixGetData(pixs);
    wpls = pixGetWpl(pixs);

    wd = ws / factor;
    hd = hs / factor;
    if ((pixd = pixCreate(wd, hd, 1)) == NULL)
        return (PIX *)ERROR_PTR("pixd not made", procName, NULL);
    pixCopyResolution(pixd, pixs);
    pixCopyInputFormat(pixd, pixs);
    scale = 1. / (l_float32) factor;
    pixScaleResolution(pixd, scale, scale);
    datad = pixGetData(pixd);
    wpld = pixGetWpl(pixd);

    for (i = 0; i < hd; i++) {
        lines = datas + i * factor * wpls;
        lined = datad + i * wpld;
        for (j = 0, sj = 0; j < wd; j++, sj += factor) {
            byteval = GET_DATA_BYTE(lines, sj);
            if (byteval < thresh)
                SET_DATA_BIT(lined, j);
        }
    }

    return pixd;
}


/*------------------------------------------------------------------*
 *               Downscaling with (antialias) smoothing             *
 *------------------------------------------------------------------*/
/*!
 * \brief   pixScaleSmooth()
 *
 * \param[in]    pix 2, 4, 8 or 32 bpp; and 2, 4, 8 bpp with colormap
 * \param[in]    scalex, scaley must both be < 0.7
 * \return  pixd, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) This function should only be used when the scale factors are less
 *          than or equal to 0.7 (i.e., more than about 1.42x reduction).
 *          If either scale factor is larger than 0.7, we issue a warning
 *          and call pixScaleGeneral(), which will invoke linear
 *          interpolation without sharpening.
 *      (2) This works only on 2, 4, 8 and 32 bpp images, and if there is
 *          a colormap, it is removed by converting to RGB.  In other
 *          cases, we issue a warning and call pixScaleGeneral().
 *      (3) It does simple (flat filter) convolution, with a filter size
 *          commensurate with the amount of reduction, to avoid antialiasing.
 *      (4) It does simple subsampling after smoothing, which is appropriate
 *          for this range of scaling.  Linear interpolation gives essentially
 *          the same result with more computation for these scale factors,
 *          so we don't use it.
 *      (5) The result is the same as doing a full block convolution followed by
 *          subsampling, but this is faster because the results of the block
 *          convolution are only computed at the subsampling locations.
 *          In fact, the computation time is approximately independent of
 *          the scale factor, because the convolution kernel is adjusted
 *          so that each source pixel is summed approximately once.
 * </pre>
 */
PIX *
pixScaleSmooth(PIX       *pix,
               l_float32  scalex,
               l_float32  scaley)
{
l_int32    ws, hs, d, wd, hd, wpls, wpld, isize;
l_uint32  *datas, *datad;
l_float32  minscale, size;
PIX       *pixs, *pixd;

    PROCNAME("pixScaleSmooth");

    if (!pix)
        return (PIX *)ERROR_PTR("pix not defined", procName, NULL);
    if (scalex >= 0.7 || scaley >= 0.7) {
        L_WARNING("scaling factor not < 0.7; do regular scaling\n", procName);
        return pixScaleGeneral(pix, scalex, scaley, 0.0, 0);
    }

        /* Remove colormap if necessary.
         * If 2 bpp or 4 bpp gray, convert to 8 bpp */
    d = pixGetDepth(pix);
    if ((d == 2 || d == 4 || d == 8) && pixGetColormap(pix)) {
        L_WARNING("pix has colormap; removing\n", procName);
        pixs = pixRemoveColormap(pix, REMOVE_CMAP_BASED_ON_SRC);
        d = pixGetDepth(pixs);
    } else if (d == 2 || d == 4) {
        pixs = pixConvertTo8(pix, FALSE);
        d = 8;
    } else {
        pixs = pixClone(pix);
    }

    if (d != 8 && d != 32) {   /* d == 1 or d == 16 */
        L_WARNING("depth not 8 or 32 bpp; do regular scaling\n", procName);
        pixDestroy(&pixs);
        return pixScaleGeneral(pix, scalex, scaley, 0.0, 0);
    }

        /* If 1.42 < 1/minscale < 2.5, use isize = 2
         * If 2.5 =< 1/minscale < 3.5, use isize = 3, etc.
         * Under no conditions use isize < 2  */
    minscale = L_MIN(scalex, scaley);
    size = 1.0 / minscale;   /* ideal filter full width */
    isize = L_MAX(2, (l_int32)(size + 0.5));

    pixGetDimensions(pixs, &ws, &hs, NULL);
    if ((ws < isize) || (hs < isize)) {
        pixDestroy(&pixs);
        return (PIX *)ERROR_PTR("pixs too small", procName, NULL);
    }
    datas = pixGetData(pixs);
    wpls = pixGetWpl(pixs);
    wd = (l_int32)(scalex * (l_float32)ws + 0.5);
    hd = (l_int32)(scaley * (l_float32)hs + 0.5);
    if (wd < 1 || hd < 1) {
        pixDestroy(&pixs);
        return (PIX *)ERROR_PTR("pixd too small", procName, NULL);
    }
    if ((pixd = pixCreate(wd, hd, d)) == NULL) {
        pixDestroy(&pixs);
        return (PIX *)ERROR_PTR("pixd not made", procName, NULL);
    }
    pixCopyResolution(pixd, pixs);
    pixCopyInputFormat(pixd, pixs);
    pixScaleResolution(pixd, scalex, scaley);
    datad = pixGetData(pixd);
    wpld = pixGetWpl(pixd);
    scaleSmoothLow(datad, wd, hd, wpld, datas, ws, hs, d, wpls, isize);
    if (d == 32 && pixGetSpp(pixs) == 4)
        pixScaleAndTransferAlpha(pixd, pixs, scalex, scaley);

    pixDestroy(&pixs);
    return pixd;
}


/*!
 * \brief   pixScaleSmoothToSize()
 *
 * \param[in]    pix 2, 4, 8 or 32 bpp; and 2, 4, 8 bpp with colormap
 * \param[in]    wd  target width; use 0 if using height as target
 * \param[in]    hd  target height; use 0 if using width as target
 * \return  pixd, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) See notes in pixScaleSmooth().
 *      (2) The output scaled image has the dimension(s) you specify:
 *          * To specify the width with isotropic scaling, set %hd = 0.
 *          * To specify the height with isotropic scaling, set %wd = 0.
 *          * If both %wd and %hd are specified, the image is scaled
 *             (in general, anisotropically) to that size.
 *          * It is an error to set both %wd and %hd to 0.
 * </pre>
 */
PIX *
pixScaleSmoothToSize(PIX     *pixs,
                     l_int32  wd,
                     l_int32  hd)
{
l_int32    w, h;
l_float32  scalex, scaley;

    PROCNAME("pixScaleSmoothToSize");

    if (!pixs)
        return (PIX *)ERROR_PTR("pixs not defined", procName, NULL);
    if (wd <= 0 && hd <= 0)
        return (PIX *)ERROR_PTR("neither wd nor hd > 0", procName, NULL);

    pixGetDimensions(pixs, &w, &h, NULL);
    if (wd <= 0) {
        scaley = (l_float32)hd / (l_float32)h;
        scalex = scaley;
    } else if (hd <= 0) {
        scalex = (l_float32)wd / (l_float32)w;
        scaley = scalex;
    } else {
        scalex = (l_float32)wd / (l_float32)w;
        scaley = (l_float32)hd / (l_float32)h;
    }

    return pixScaleSmooth(pixs, scalex, scaley);
}


/*!
 * \brief   pixScaleRGBToGray2()
 *
 * \param[in]    pixs 32 bpp rgb
 * \param[in]    rwt, gwt, bwt must sum to 1.0
 * \return  pixd, 8 bpp, 2x reduced, or NULL on error
 */
PIX *
pixScaleRGBToGray2(PIX       *pixs,
                   l_float32  rwt,
                   l_float32  gwt,
                   l_float32  bwt)
{
l_int32    wd, hd, wpls, wpld;
l_uint32  *datas, *datad;
PIX       *pixd;

    PROCNAME("pixScaleRGBToGray2");

    if (!pixs)
        return (PIX *)ERROR_PTR("pixs not defined", procName, NULL);
    if (pixGetDepth(pixs) != 32)
        return (PIX *)ERROR_PTR("pixs not 32 bpp", procName, NULL);
    if (rwt + gwt + bwt < 0.98 || rwt + gwt + bwt > 1.02)
        return (PIX *)ERROR_PTR("sum of wts should be 1.0", procName, NULL);

    wd = pixGetWidth(pixs) / 2;
    hd = pixGetHeight(pixs) / 2;
    wpls = pixGetWpl(pixs);
    datas = pixGetData(pixs);
    if ((pixd = pixCreate(wd, hd, 8)) == NULL)
        return (PIX *)ERROR_PTR("pixd not made", procName, NULL);
    pixCopyResolution(pixd, pixs);
    pixCopyInputFormat(pixd, pixs);
    pixScaleResolution(pixd, 0.5, 0.5);
    wpld = pixGetWpl(pixd);
    datad = pixGetData(pixd);
    scaleRGBToGray2Low(datad, wd, hd, wpld, datas, wpls, rwt, gwt, bwt);
    return pixd;
}


/*------------------------------------------------------------------*
 *             Downscaling with (antialias) area mapping            *
 *------------------------------------------------------------------*/
/*!
 * \brief   pixScaleAreaMap()
 *
 * \param[in]    pix 2, 4, 8 or 32 bpp; and 2, 4, 8 bpp with colormap
 * \param[in]    scalex, scaley must both be <= 0.7
 * \return  pixd, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) This function should only be used when the scale factors are less
 *          than or equal to 0.7 (i.e., more than about 1.42x reduction).
 *          If either scale factor is larger than 0.7, we issue a warning
 *          and call pixScaleGeneral(), which will invoke linear
 *          interpolation without sharpening.
 *      (2) This works only on 2, 4, 8 and 32 bpp images.  If there is
 *          a colormap, it is removed by converting to RGB.  In other
 *          cases, we issue a warning and call pixScaleGeneral().
 *      (3) This is faster than pixScale() because it does not do sharpening.
 *      (4) It does a relatively expensive area mapping computation, to
 *          avoid antialiasing.  It is about 2x slower than pixScaleSmooth(),
 *          but the results are much better on fine text.
 *      (5) This is typically about 20% faster for the special cases of
 *          2x, 4x, 8x and 16x reduction.
 *      (6) Surprisingly, there is no speedup (and a slight quality
 *          impairment) if you do as many successive 2x reductions as
 *          possible, ending with a reduction with a scale factor larger
 *          than 0.5.
 * </pre>
 */
PIX *
pixScaleAreaMap(PIX       *pix,
                l_float32  scalex,
                l_float32  scaley)
{
l_int32    ws, hs, d, wd, hd, wpls, wpld;
l_uint32  *datas, *datad;
l_float32  maxscale;
PIX       *pixs, *pixd, *pixt1, *pixt2, *pixt3;

    PROCNAME("pixScaleAreaMap");

    if (!pix)
        return (PIX *)ERROR_PTR("pix not defined", procName, NULL);
    d = pixGetDepth(pix);
    if (d != 2 && d != 4 && d != 8 && d != 32)
        return (PIX *)ERROR_PTR("pix not 2, 4, 8 or 32 bpp", procName, NULL);
    maxscale = L_MAX(scalex, scaley);
    if (maxscale >= 0.7) {
        L_WARNING("scaling factors not < 0.7; do regular scaling\n", procName);
        return pixScaleGeneral(pix, scalex, scaley, 0.0, 0);
    }

        /* Special cases: 2x, 4x, 8x, 16x reduction */
    if (scalex == 0.5 && scaley == 0.5)
        return pixScaleAreaMap2(pix);
    if (scalex == 0.25 && scaley == 0.25) {
        pixt1 = pixScaleAreaMap2(pix);
        pixd = pixScaleAreaMap2(pixt1);
        pixDestroy(&pixt1);
        return pixd;
    }
    if (scalex == 0.125 && scaley == 0.125) {
        pixt1 = pixScaleAreaMap2(pix);
        pixt2 = pixScaleAreaMap2(pixt1);
        pixd = pixScaleAreaMap2(pixt2);
        pixDestroy(&pixt1);
        pixDestroy(&pixt2);
        return pixd;
    }
    if (scalex == 0.0625 && scaley == 0.0625) {
        pixt1 = pixScaleAreaMap2(pix);
        pixt2 = pixScaleAreaMap2(pixt1);
        pixt3 = pixScaleAreaMap2(pixt2);
        pixd = pixScaleAreaMap2(pixt3);
        pixDestroy(&pixt1);
        pixDestroy(&pixt2);
        pixDestroy(&pixt3);
        return pixd;
    }

        /* Remove colormap if necessary.
         * If 2 bpp or 4 bpp gray, convert to 8 bpp */
    if ((d == 2 || d == 4 || d == 8) && pixGetColormap(pix)) {
        L_WARNING("pix has colormap; removing\n", procName);
        pixs = pixRemoveColormap(pix, REMOVE_CMAP_BASED_ON_SRC);
        d = pixGetDepth(pixs);
    } else if (d == 2 || d == 4) {
        pixs = pixConvertTo8(pix, FALSE);
        d = 8;
    } else {
        pixs = pixClone(pix);
    }

    pixGetDimensions(pixs, &ws, &hs, NULL);
    datas = pixGetData(pixs);
    wpls = pixGetWpl(pixs);
    wd = (l_int32)(scalex * (l_float32)ws + 0.5);
    hd = (l_int32)(scaley * (l_float32)hs + 0.5);
    if (wd < 1 || hd < 1) {
        pixDestroy(&pixs);
        return (PIX *)ERROR_PTR("pixd too small", procName, NULL);
    }
    if ((pixd = pixCreate(wd, hd, d)) == NULL) {
        pixDestroy(&pixs);
        return (PIX *)ERROR_PTR("pixd not made", procName, NULL);
    }
    pixCopyInputFormat(pixd, pixs);
    pixCopyResolution(pixd, pixs);
    pixScaleResolution(pixd, scalex, scaley);
    datad = pixGetData(pixd);
    wpld = pixGetWpl(pixd);
    if (d == 8) {
        scaleGrayAreaMapLow(datad, wd, hd, wpld, datas, ws, hs, wpls);
    } else {  /* RGB, d == 32 */
        scaleColorAreaMapLow(datad, wd, hd, wpld, datas, ws, hs, wpls);
        if (pixGetSpp(pixs) == 4)
            pixScaleAndTransferAlpha(pixd, pixs, scalex, scaley);
    }

    pixDestroy(&pixs);
    return pixd;
}


/*!
 * \brief   pixScaleAreaMap2()
 *
 * \param[in]    pix 2, 4, 8 or 32 bpp; and 2, 4, 8 bpp with colormap
 * \return  pixd, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) This function does an area mapping (average) for 2x
 *          reduction.
 *      (2) This works only on 2, 4, 8 and 32 bpp images.  If there is
 *          a colormap, it is removed by converting to RGB.
 *      (3) Speed on 3 GHz processor:
 *             Color: 160 Mpix/sec
 *             Gray: 700 Mpix/sec
 *          This contrasts with the speed of the general pixScaleAreaMap():
 *             Color: 35 Mpix/sec
 *             Gray: 50 Mpix/sec
 *      (4) From (3), we see that this special function is about 4.5x
 *          faster for color and 14x faster for grayscale
 *      (5) Consequently, pixScaleAreaMap2() is incorporated into the
 *          general area map scaling function, for the special cases
 *          of 2x, 4x, 8x and 16x reduction.
 * </pre>
 */
PIX *
pixScaleAreaMap2(PIX  *pix)
{
l_int32    wd, hd, d, wpls, wpld;
l_uint32  *datas, *datad;
PIX       *pixs, *pixd;

    PROCNAME("pixScaleAreaMap2");

    if (!pix)
        return (PIX *)ERROR_PTR("pix not defined", procName, NULL);
    d = pixGetDepth(pix);
    if (d != 2 && d != 4 && d != 8 && d != 32)
        return (PIX *)ERROR_PTR("pix not 2, 4, 8 or 32 bpp", procName, NULL);

        /* Remove colormap if necessary.
         * If 2 bpp or 4 bpp gray, convert to 8 bpp */
    if ((d == 2 || d == 4 || d == 8) && pixGetColormap(pix)) {
        L_WARNING("pix has colormap; removing\n", procName);
        pixs = pixRemoveColormap(pix, REMOVE_CMAP_BASED_ON_SRC);
        d = pixGetDepth(pixs);
    } else if (d == 2 || d == 4) {
        pixs = pixConvertTo8(pix, FALSE);
        d = 8;
    } else {
        pixs = pixClone(pix);
    }

    wd = pixGetWidth(pixs) / 2;
    hd = pixGetHeight(pixs) / 2;
    datas = pixGetData(pixs);
    wpls = pixGetWpl(pixs);
    pixd = pixCreate(wd, hd, d);
    datad = pixGetData(pixd);
    wpld = pixGetWpl(pixd);
    pixCopyInputFormat(pixd, pixs);
    pixCopyResolution(pixd, pixs);
    pixScaleResolution(pixd, 0.5, 0.5);
    scaleAreaMapLow2(datad, wd, hd, wpld, datas, d, wpls);
    if (pixGetSpp(pixs) == 4)
        pixScaleAndTransferAlpha(pixd, pixs, 0.5, 0.5);
    pixDestroy(&pixs);
    return pixd;
}


/*!
 * \brief   pixScaleAreaMapToSize()
 *
 * \param[in]    pix 2, 4, 8 or 32 bpp; and 2, 4, 8 bpp with colormap
 * \param[in]    wd  target width; use 0 if using height as target
 * \param[in]    hd  target height; use 0 if using width as target
 * \return  pixd, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) See notes in pixScaleAreaMap().
 *      (2) The output scaled image has the dimension(s) you specify:
 *          * To specify the width with isotropic scaling, set %hd = 0.
 *          * To specify the height with isotropic scaling, set %wd = 0.
 *          * If both %wd and %hd are specified, the image is scaled
 *             (in general, anisotropically) to that size.
 *          * It is an error to set both %wd and %hd to 0.
 * </pre>
 */
PIX *
pixScaleAreaMapToSize(PIX     *pixs,
                      l_int32  wd,
                      l_int32  hd)
{
l_int32    w, h;
l_float32  scalex, scaley;

    PROCNAME("pixScaleAreaMapToSize");

    if (!pixs)
        return (PIX *)ERROR_PTR("pixs not defined", procName, NULL);
    if (wd <= 0 && hd <= 0)
        return (PIX *)ERROR_PTR("neither wd nor hd > 0", procName, NULL);

    pixGetDimensions(pixs, &w, &h, NULL);
    if (wd <= 0) {
        scaley = (l_float32)hd / (l_float32)h;
        scalex = scaley;
    } else if (hd <= 0) {
        scalex = (l_float32)wd / (l_float32)w;
        scaley = scalex;
    } else {
        scalex = (l_float32)wd / (l_float32)w;
        scaley = (l_float32)hd / (l_float32)h;
    }

    return pixScaleAreaMap(pixs, scalex, scaley);
}


/*------------------------------------------------------------------*
 *               Binary scaling by closest pixel sampling           *
 *------------------------------------------------------------------*/
/*!
 * \brief   pixScaleBinary()
 *
 * \param[in]    pixs 1 bpp
 * \param[in]    scalex, scaley both > 0.0
 * \return  pixd, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) This function samples from the source without
 *          filtering.  As a result, aliasing will occur for
 *          subsampling (scalex and scaley < 1.0).
 * </pre>
 */
PIX *
pixScaleBinary(PIX       *pixs,
               l_float32  scalex,
               l_float32  scaley)
{
l_int32    ws, hs, wpls, wd, hd, wpld;
l_uint32  *datas, *datad;
PIX       *pixd;

    PROCNAME("pixScaleBinary");

    if (!pixs)
        return (PIX *)ERROR_PTR("pixs not defined", procName, NULL);
    if (pixGetDepth(pixs) != 1)
        return (PIX *)ERROR_PTR("pixs must be 1 bpp", procName, NULL);
    if (scalex <= 0.0 || scaley <= 0.0)
        return (PIX *)ERROR_PTR("scale factor <= 0", procName, NULL);
    if (scalex == 1.0 && scaley == 1.0)
        return pixCopy(NULL, pixs);

    pixGetDimensions(pixs, &ws, &hs, NULL);
    datas = pixGetData(pixs);
    wpls = pixGetWpl(pixs);
    wd = (l_int32)(scalex * (l_float32)ws + 0.5);
    hd = (l_int32)(scaley * (l_float32)hs + 0.5);
    if ((pixd = pixCreate(wd, hd, 1)) == NULL)
        return (PIX *)ERROR_PTR("pixd not made", procName, NULL);
    pixCopyColormap(pixd, pixs);
    pixCopyText(pixd, pixs);
    pixCopyInputFormat(pixd, pixs);
    pixCopyResolution(pixd, pixs);
    pixScaleResolution(pixd, scalex, scaley);
    datad = pixGetData(pixd);
    wpld = pixGetWpl(pixd);
    scaleBinaryLow(datad, wd, hd, wpld, datas, ws, hs, wpls);
    return pixd;
}


/* ================================================================ *
 *                    Low level static functions                    *
 * ================================================================ */

/*------------------------------------------------------------------*
 *            General linear interpolated color scaling             *
 *------------------------------------------------------------------*/
/*!
 * \brief   scaleColorLILow()
 *
 *  We choose to divide each pixel into 16 x 16 sub-pixels.
 *  Linear interpolation is equivalent to finding the
 *  fractional area (i.e., number of sub-pixels divided
 *  by 256) associated with each of the four nearest src pixels,
 *  and weighting each pixel value by this fractional area.
 *
 *  P3 speed is about 7 x 10^6 dst pixels/sec/GHz
 */
static void
scaleColorLILow(l_uint32  *datad,
               l_int32    wd,
               l_int32    hd,
               l_int32    wpld,
               l_uint32  *datas,
               l_int32    ws,
               l_int32    hs,
               l_int32    wpls)
{
l_int32    i, j, wm2, hm2;
l_int32    xpm, ypm;  /* location in src image, to 1/16 of a pixel */
l_int32    xp, yp, xf, yf;  /* src pixel and pixel fraction coordinates */
l_int32    v00r, v01r, v10r, v11r, v00g, v01g, v10g, v11g;
l_int32    v00b, v01b, v10b, v11b, area00, area01, area10, area11;
l_uint32   pixels1, pixels2, pixels3, pixels4, pixel;
l_uint32  *lines, *lined;
l_float32  scx, scy;

        /* (scx, scy) are scaling factors that are applied to the
         * dest coords to get the corresponding src coords.
         * We need them because we iterate over dest pixels
         * and must find the corresponding set of src pixels. */
    scx = 16. * (l_float32)ws / (l_float32)wd;
    scy = 16. * (l_float32)hs / (l_float32)hd;
    wm2 = ws - 2;
    hm2 = hs - 2;

        /* Iterate over the destination pixels */
    for (i = 0; i < hd; i++) {
        ypm = (l_int32)(scy * (l_float32)i);
        yp = ypm >> 4;
        yf = ypm & 0x0f;
        lined = datad + i * wpld;
        lines = datas + yp * wpls;
        for (j = 0; j < wd; j++) {
            xpm = (l_int32)(scx * (l_float32)j);
            xp = xpm >> 4;
            xf = xpm & 0x0f;

                /* Do bilinear interpolation.  This is a simple
                 * generalization of the calculation in scaleGrayLILow().
                 * Without this, we could simply subsample:
                 *     *(lined + j) = *(lines + xp);
                 * which is faster but gives lousy results!  */
            pixels1 = *(lines + xp);

            if (xp > wm2 || yp > hm2) {
                if (yp > hm2 && xp <= wm2) {  /* pixels near bottom */
                    pixels2 = *(lines + xp + 1);
                    pixels3 = pixels1;
                    pixels4 = pixels2;
                } else if (xp > wm2 && yp <= hm2) {  /* pixels near rt side */
                    pixels2 = pixels1;
                    pixels3 = *(lines + wpls + xp);
                    pixels4 = pixels3;
                } else {  /* pixels at LR corner */
                    pixels4 = pixels3 = pixels2 = pixels1;
                }
            } else {
                pixels2 = *(lines + xp + 1);
                pixels3 = *(lines + wpls + xp);
                pixels4 = *(lines + wpls + xp + 1);
            }

            area00 = (16 - xf) * (16 - yf);
            area10 = xf * (16 - yf);
            area01 = (16 - xf) * yf;
            area11 = xf * yf;
            v00r = area00 * ((pixels1 >> L_RED_SHIFT) & 0xff);
            v00g = area00 * ((pixels1 >> L_GREEN_SHIFT) & 0xff);
            v00b = area00 * ((pixels1 >> L_BLUE_SHIFT) & 0xff);
            v10r = area10 * ((pixels2 >> L_RED_SHIFT) & 0xff);
            v10g = area10 * ((pixels2 >> L_GREEN_SHIFT) & 0xff);
            v10b = area10 * ((pixels2 >> L_BLUE_SHIFT) & 0xff);
            v01r = area01 * ((pixels3 >> L_RED_SHIFT) & 0xff);
            v01g = area01 * ((pixels3 >> L_GREEN_SHIFT) & 0xff);
            v01b = area01 * ((pixels3 >> L_BLUE_SHIFT) & 0xff);
            v11r = area11 * ((pixels4 >> L_RED_SHIFT) & 0xff);
            v11g = area11 * ((pixels4 >> L_GREEN_SHIFT) & 0xff);
            v11b = area11 * ((pixels4 >> L_BLUE_SHIFT) & 0xff);
            pixel = (((v00r + v10r + v01r + v11r + 128) << 16) & 0xff000000) |
                    (((v00g + v10g + v01g + v11g + 128) << 8) & 0x00ff0000) |
                    ((v00b + v10b + v01b + v11b + 128) & 0x0000ff00);
            *(lined + j) = pixel;
        }
    }
}


/*------------------------------------------------------------------*
 *            General linear interpolated gray scaling              *
 *------------------------------------------------------------------*/
/*!
 * \brief   scaleGrayLILow()
 *
 *  We choose to divide each pixel into 16 x 16 sub-pixels.
 *  Linear interpolation is equivalent to finding the
 *  fractional area (i.e., number of sub-pixels divided
 *  by 256) associated with each of the four nearest src pixels,
 *  and weighting each pixel value by this fractional area.
 */
static void
scaleGrayLILow(l_uint32  *datad,
               l_int32    wd,
               l_int32    hd,
               l_int32    wpld,
               l_uint32  *datas,
               l_int32    ws,
               l_int32    hs,
               l_int32    wpls)
{
l_int32    i, j, wm2, hm2;
l_int32    xpm, ypm;  /* location in src image, to 1/16 of a pixel */
l_int32    xp, yp, xf, yf;  /* src pixel and pixel fraction coordinates */
l_int32    v00, v01, v10, v11, v00_val, v01_val, v10_val, v11_val;
l_uint8    val;
l_uint32  *lines, *lined;
l_float32  scx, scy;

        /* (scx, scy) are scaling factors that are applied to the
         * dest coords to get the corresponding src coords.
         * We need them because we iterate over dest pixels
         * and must find the corresponding set of src pixels. */
    scx = 16. * (l_float32)ws / (l_float32)wd;
    scy = 16. * (l_float32)hs / (l_float32)hd;
    wm2 = ws - 2;
    hm2 = hs - 2;

        /* Iterate over the destination pixels */
    for (i = 0; i < hd; i++) {
        ypm = (l_int32)(scy * (l_float32)i);
        yp = ypm >> 4;
        yf = ypm & 0x0f;
        lined = datad + i * wpld;
        lines = datas + yp * wpls;
        for (j = 0; j < wd; j++) {
            xpm = (l_int32)(scx * (l_float32)j);
            xp = xpm >> 4;
            xf = xpm & 0x0f;

                /* Do bilinear interpolation.  Without this, we could
                 * simply subsample:
                 *   SET_DATA_BYTE(lined, j, GET_DATA_BYTE(lines, xp));
                 * which is faster but gives lousy results!  */
            v00_val = GET_DATA_BYTE(lines, xp);
            if (xp > wm2 || yp > hm2) {
                if (yp > hm2 && xp <= wm2) {  /* pixels near bottom */
                    v01_val = v00_val;
                    v10_val = GET_DATA_BYTE(lines, xp + 1);
                    v11_val = v10_val;
                } else if (xp > wm2 && yp <= hm2) {  /* pixels near rt side */
                    v01_val = GET_DATA_BYTE(lines + wpls, xp);
                    v10_val = v00_val;
                    v11_val = v01_val;
                } else {  /* pixels at LR corner */
                    v10_val = v01_val = v11_val = v00_val;
                }
            } else {
                v10_val = GET_DATA_BYTE(lines, xp + 1);
                v01_val = GET_DATA_BYTE(lines + wpls, xp);
                v11_val = GET_DATA_BYTE(lines + wpls, xp + 1);
            }

            v00 = (16 - xf) * (16 - yf) * v00_val;
            v10 = xf * (16 - yf) * v10_val;
            v01 = (16 - xf) * yf * v01_val;
            v11 = xf * yf * v11_val;

            val = (l_uint8)((v00 + v01 + v10 + v11 + 128) / 256);
            SET_DATA_BYTE(lined, j, val);
        }
    }
}


/*------------------------------------------------------------------*
 *                2x linear interpolated color scaling              *
 *------------------------------------------------------------------*/
/*!
 * \brief   scaleColor2xLILow()
 *
 *  This is a special case of 2x expansion by linear
 *  interpolation.  Each src pixel contains 4 dest pixels.
 *  The 4 dest pixels in src pixel 1 are numbered at
 *  their UL corners.  The 4 dest pixels in src pixel 1
 *  are related to that src pixel and its 3 neighboring
 *  src pixels as follows:
 *
 *             1-----2-----|-----|-----|
 *             |     |     |     |     |
 *             |     |     |     |     |
 *  src 1 -->  3-----4-----|     |     |  <-- src 2
 *             |     |     |     |     |
 *             |     |     |     |     |
 *             |-----|-----|-----|-----|
 *             |     |     |     |     |
 *             |     |     |     |     |
 *  src 3 -->  |     |     |     |     |  <-- src 4
 *             |     |     |     |     |
 *             |     |     |     |     |
 *             |-----|-----|-----|-----|
 *
 *           dest      src
 *           ----      ---
 *           dp1    =  sp1
 *           dp2    =  (sp1 + sp2) / 2
 *           dp3    =  (sp1 + sp3) / 2
 *           dp4    =  (sp1 + sp2 + sp3 + sp4) / 4
 *
 *  We iterate over the src pixels, and unroll the calculation
 *  for each set of 4 dest pixels corresponding to that src
 *  pixel, caching pixels for the next src pixel whenever possible.
 *  The method is exactly analogous to the one we use for
 *  scaleGray2xLILow() and its line version.
 *
 *  P3 speed is about 5 x 10^7 dst pixels/sec/GHz
 */
static void
scaleColor2xLILow(l_uint32  *datad,
                  l_int32    wpld,
                  l_uint32  *datas,
                  l_int32    ws,
                  l_int32    hs,
                  l_int32    wpls)
{
l_int32    i, hsm;
l_uint32  *lines, *lined;

    hsm = hs - 1;

        /* We're taking 2 src and 2 dest lines at a time,
         * and for each src line, we're computing 2 dest lines.
         * Call these 2 dest lines:  destline1 and destline2.
         * The first src line is used for destline 1.
         * On all but the last src line, both src lines are
         * used in the linear interpolation for destline2.
         * On the last src line, both destline1 and destline2
         * are computed using only that src line (because there
         * isn't a lower src line). */

        /* iterate over all but the last src line */
    for (i = 0; i < hsm; i++) {
        lines = datas + i * wpls;
        lined = datad + 2 * i * wpld;
        scaleColor2xLILineLow(lined, wpld, lines, ws, wpls, 0);
    }

        /* last src line */
    lines = datas + hsm * wpls;
    lined = datad + 2 * hsm * wpld;
    scaleColor2xLILineLow(lined, wpld, lines, ws, wpls, 1);
}


/*!
 * \brief   scaleColor2xLILineLow()
 *
 * \param[in]    lined   ptr to top destline, to be made from current src line
 * \param[in]    wpld
 * \param[in]    lines   ptr to current src line
 * \param[in]    ws
 * \param[in]    wpls
 * \param[in]    lastlineflag  1 if last src line; 0 otherwise
 * \return  void
 */
static void
scaleColor2xLILineLow(l_uint32  *lined,
                      l_int32    wpld,
                      l_uint32  *lines,
                      l_int32    ws,
                      l_int32    wpls,
                      l_int32    lastlineflag)
{
l_int32    j, jd, wsm;
l_int32    rval1, rval2, rval3, rval4, gval1, gval2, gval3, gval4;
l_int32    bval1, bval2, bval3, bval4;
l_uint32   pixels1, pixels2, pixels3, pixels4, pixel;
l_uint32  *linesp, *linedp;

    wsm = ws - 1;

    if (lastlineflag == 0) {
        linesp = lines + wpls;
        linedp = lined + wpld;
        pixels1 = *lines;
        pixels3 = *linesp;

            /* initialize with v(2) and v(4) */
        rval2 = pixels1 >> 24;
        gval2 = (pixels1 >> 16) & 0xff;
        bval2 = (pixels1 >> 8) & 0xff;
        rval4 = pixels3 >> 24;
        gval4 = (pixels3 >> 16) & 0xff;
        bval4 = (pixels3 >> 8) & 0xff;

        for (j = 0, jd = 0; j < wsm; j++, jd += 2) {
                /* shift in previous src values */
            rval1 = rval2;
            gval1 = gval2;
            bval1 = bval2;
            rval3 = rval4;
            gval3 = gval4;
            bval3 = bval4;
                /* get new src values */
            pixels2 = *(lines + j + 1);
            pixels4 = *(linesp + j + 1);
            rval2 = pixels2 >> 24;
            gval2 = (pixels2 >> 16) & 0xff;
            bval2 = (pixels2 >> 8) & 0xff;
            rval4 = pixels4 >> 24;
            gval4 = (pixels4 >> 16) & 0xff;
            bval4 = (pixels4 >> 8) & 0xff;
                /* save dest values */
            pixel = (rval1 << 24 | gval1 << 16 | bval1 << 8);
            *(lined + jd) = pixel;                               /* pix 1 */
            pixel = ((((rval1 + rval2) << 23) & 0xff000000) |
                     (((gval1 + gval2) << 15) & 0x00ff0000) |
                     (((bval1 + bval2) << 7) & 0x0000ff00));
            *(lined + jd + 1) = pixel;                           /* pix 2 */
            pixel = ((((rval1 + rval3) << 23) & 0xff000000) |
                     (((gval1 + gval3) << 15) & 0x00ff0000) |
                     (((bval1 + bval3) << 7) & 0x0000ff00));
            *(linedp + jd) = pixel;                              /* pix 3 */
            pixel = ((((rval1 + rval2 + rval3 + rval4) << 22) & 0xff000000) |
                     (((gval1 + gval2 + gval3 + gval4) << 14) & 0x00ff0000) |
                     (((bval1 + bval2 + bval3 + bval4) << 6) & 0x0000ff00));
            *(linedp + jd + 1) = pixel;                          /* pix 4 */
        }
            /* last src pixel on line */
        rval1 = rval2;
        gval1 = gval2;
        bval1 = bval2;
        rval3 = rval4;
        gval3 = gval4;
        bval3 = bval4;
        pixel = (rval1 << 24 | gval1 << 16 | bval1 << 8);
        *(lined + 2 * wsm) = pixel;                        /* pix 1 */
        *(lined + 2 * wsm + 1) = pixel;                    /* pix 2 */
        pixel = ((((rval1 + rval3) << 23) & 0xff000000) |
                 (((gval1 + gval3) << 15) & 0x00ff0000) |
                 (((bval1 + bval3) << 7) & 0x0000ff00));
        *(linedp + 2 * wsm) = pixel;                       /* pix 3 */
        *(linedp + 2 * wsm + 1) = pixel;                   /* pix 4 */
    } else {   /* last row of src pixels: lastlineflag == 1 */
        linedp = lined + wpld;
        pixels2 = *lines;
        rval2 = pixels2 >> 24;
        gval2 = (pixels2 >> 16) & 0xff;
        bval2 = (pixels2 >> 8) & 0xff;
        for (j = 0, jd = 0; j < wsm; j++, jd += 2) {
            rval1 = rval2;
            gval1 = gval2;
            bval1 = bval2;
            pixels2 = *(lines + j + 1);
            rval2 = pixels2 >> 24;
            gval2 = (pixels2 >> 16) & 0xff;
            bval2 = (pixels2 >> 8) & 0xff;
            pixel = (rval1 << 24 | gval1 << 16 | bval1 << 8);
            *(lined + jd) = pixel;                            /* pix 1 */
            *(linedp + jd) = pixel;                           /* pix 2 */
            pixel = ((((rval1 + rval2) << 23) & 0xff000000) |
                     (((gval1 + gval2) << 15) & 0x00ff0000) |
                     (((bval1 + bval2) << 7) & 0x0000ff00));
            *(lined + jd + 1) = pixel;                        /* pix 3 */
            *(linedp + jd + 1) = pixel;                       /* pix 4 */
        }
        rval1 = rval2;
        gval1 = gval2;
        bval1 = bval2;
        pixel = (rval1 << 24 | gval1 << 16 | bval1 << 8);
        *(lined + 2 * wsm) = pixel;                           /* pix 1 */
        *(lined + 2 * wsm + 1) = pixel;                       /* pix 2 */
        *(linedp + 2 * wsm) = pixel;                          /* pix 3 */
        *(linedp + 2 * wsm + 1) = pixel;                      /* pix 4 */
    }
}


/*------------------------------------------------------------------*
 *                2x linear interpolated gray scaling               *
 *------------------------------------------------------------------*/
/*!
 * \brief   scaleGray2xLILow()
 *
 *  This is a special case of 2x expansion by linear
 *  interpolation.  Each src pixel contains 4 dest pixels.
 *  The 4 dest pixels in src pixel 1 are numbered at
 *  their UL corners.  The 4 dest pixels in src pixel 1
 *  are related to that src pixel and its 3 neighboring
 *  src pixels as follows:
 *
 *             1-----2-----|-----|-----|
 *             |     |     |     |     |
 *             |     |     |     |     |
 *  src 1 -->  3-----4-----|     |     |  <-- src 2
 *             |     |     |     |     |
 *             |     |     |     |     |
 *             |-----|-----|-----|-----|
 *             |     |     |     |     |
 *             |     |     |     |     |
 *  src 3 -->  |     |     |     |     |  <-- src 4
 *             |     |     |     |     |
 *             |     |     |     |     |
 *             |-----|-----|-----|-----|
 *
 *           dest      src
 *           ----      ---
 *           dp1    =  sp1
 *           dp2    =  (sp1 + sp2) / 2
 *           dp3    =  (sp1 + sp3) / 2
 *           dp4    =  (sp1 + sp2 + sp3 + sp4) / 4
 *
 *  We iterate over the src pixels, and unroll the calculation
 *  for each set of 4 dest pixels corresponding to that src
 *  pixel, caching pixels for the next src pixel whenever possible.
 */
static void
scaleGray2xLILow(l_uint32  *datad,
                 l_int32    wpld,
                 l_uint32  *datas,
                 l_int32    ws,
                 l_int32    hs,
                 l_int32    wpls)
{
l_int32    i, hsm;
l_uint32  *lines, *lined;

    hsm = hs - 1;

        /* We're taking 2 src and 2 dest lines at a time,
         * and for each src line, we're computing 2 dest lines.
         * Call these 2 dest lines:  destline1 and destline2.
         * The first src line is used for destline 1.
         * On all but the last src line, both src lines are
         * used in the linear interpolation for destline2.
         * On the last src line, both destline1 and destline2
         * are computed using only that src line (because there
         * isn't a lower src line). */

        /* iterate over all but the last src line */
    for (i = 0; i < hsm; i++) {
        lines = datas + i * wpls;
        lined = datad + 2 * i * wpld;
        scaleGray2xLILineLow(lined, wpld, lines, ws, wpls, 0);
    }

        /* last src line */
    lines = datas + hsm * wpls;
    lined = datad + 2 * hsm * wpld;
    scaleGray2xLILineLow(lined, wpld, lines, ws, wpls, 1);
}


/*!
 * \brief   scaleGray2xLILineLow()
 *
 * \param[in]    lined   ptr to top destline, to be made from current src line
 * \param[in]    wpld
 * \param[in]    lines   ptr to current src line
 * \param[in]    ws
 * \param[in]    wpls
 * \param[in]    lastlineflag  1 if last src line; 0 otherwise
 * \return  void
 */
static void
scaleGray2xLILineLow(l_uint32  *lined,
                     l_int32    wpld,
                     l_uint32  *lines,
                     l_int32    ws,
                     l_int32    wpls,
                     l_int32    lastlineflag)
{
l_int32    j, jd, wsm, w;
l_int32    sval1, sval2, sval3, sval4;
l_uint32  *linesp, *linedp;
l_uint32   words, wordsp, wordd, worddp;

    wsm = ws - 1;

    if (lastlineflag == 0) {
        linesp = lines + wpls;
        linedp = lined + wpld;

            /* Unroll the loop 4x and work on full words */
        words = lines[0];
        wordsp = linesp[0];
        sval2 = (words >> 24) & 0xff;
        sval4 = (wordsp >> 24) & 0xff;
        for (j = 0, jd = 0, w = 0; j + 3 < wsm; j += 4, jd += 8, w++) {
                /* At the top of the loop,
                 * words == lines[w], wordsp == linesp[w]
                 * and the top bytes of those have been loaded into
                 * sval2 and sval4. */
            sval1 = sval2;
            sval2 = (words >> 16) & 0xff;
            sval3 = sval4;
            sval4 = (wordsp >> 16) & 0xff;
            wordd = (sval1 << 24) | (((sval1 + sval2) >> 1) << 16);
            worddp = (((sval1 + sval3) >> 1) << 24) |
                (((sval1 + sval2 + sval3 + sval4) >> 2) << 16);

            sval1 = sval2;
            sval2 = (words >> 8) & 0xff;
            sval3 = sval4;
            sval4 = (wordsp >> 8) & 0xff;
            wordd |= (sval1 << 8) | ((sval1 + sval2) >> 1);
            worddp |= (((sval1 + sval3) >> 1) << 8) |
                ((sval1 + sval2 + sval3 + sval4) >> 2);
            lined[w * 2] = wordd;
            linedp[w * 2] = worddp;

            sval1 = sval2;
            sval2 = words & 0xff;
            sval3 = sval4;
            sval4 = wordsp & 0xff;
            wordd = (sval1 << 24) |                              /* pix 1 */
                (((sval1 + sval2) >> 1) << 16);                  /* pix 2 */
            worddp = (((sval1 + sval3) >> 1) << 24) |            /* pix 3 */
                (((sval1 + sval2 + sval3 + sval4) >> 2) << 16);  /* pix 4 */

                /* Load the next word as we need its first byte */
            words = lines[w + 1];
            wordsp = linesp[w + 1];
            sval1 = sval2;
            sval2 = (words >> 24) & 0xff;
            sval3 = sval4;
            sval4 = (wordsp >> 24) & 0xff;
            wordd |= (sval1 << 8) |                              /* pix 1 */
                ((sval1 + sval2) >> 1);                          /* pix 2 */
            worddp |= (((sval1 + sval3) >> 1) << 8) |            /* pix 3 */
                ((sval1 + sval2 + sval3 + sval4) >> 2);          /* pix 4 */
            lined[w * 2 + 1] = wordd;
            linedp[w * 2 + 1] = worddp;
        }

            /* Finish up the last word */
        for (; j < wsm; j++, jd += 2) {
            sval1 = sval2;
            sval3 = sval4;
            sval2 = GET_DATA_BYTE(lines, j + 1);
            sval4 = GET_DATA_BYTE(linesp, j + 1);
            SET_DATA_BYTE(lined, jd, sval1);                     /* pix 1 */
            SET_DATA_BYTE(lined, jd + 1, (sval1 + sval2) / 2);   /* pix 2 */
            SET_DATA_BYTE(linedp, jd, (sval1 + sval3) / 2);      /* pix 3 */
            SET_DATA_BYTE(linedp, jd + 1,
                          (sval1 + sval2 + sval3 + sval4) / 4);  /* pix 4 */
        }
        sval1 = sval2;
        sval3 = sval4;
        SET_DATA_BYTE(lined, 2 * wsm, sval1);                     /* pix 1 */
        SET_DATA_BYTE(lined, 2 * wsm + 1, sval1);                 /* pix 2 */
        SET_DATA_BYTE(linedp, 2 * wsm, (sval1 + sval3) / 2);      /* pix 3 */
        SET_DATA_BYTE(linedp, 2 * wsm + 1, (sval1 + sval3) / 2);  /* pix 4 */

#if DEBUG_UNROLLING
#define CHECK_BYTE(a, b, c) if (GET_DATA_BYTE(a, b) != c) {\
     fprintf(stderr, "Error: mismatch at %d, %d vs %d\n", \
             j, GET_DATA_BYTE(a, b), c); }

        sval2 = GET_DATA_BYTE(lines, 0);
        sval4 = GET_DATA_BYTE(linesp, 0);
        for (j = 0, jd = 0; j < wsm; j++, jd += 2) {
            sval1 = sval2;
            sval3 = sval4;
            sval2 = GET_DATA_BYTE(lines, j + 1);
            sval4 = GET_DATA_BYTE(linesp, j + 1);
            CHECK_BYTE(lined, jd, sval1);                     /* pix 1 */
            CHECK_BYTE(lined, jd + 1, (sval1 + sval2) / 2);   /* pix 2 */
            CHECK_BYTE(linedp, jd, (sval1 + sval3) / 2);      /* pix 3 */
            CHECK_BYTE(linedp, jd + 1,
                          (sval1 + sval2 + sval3 + sval4) / 4);  /* pix 4 */
        }
        sval1 = sval2;
        sval3 = sval4;
        CHECK_BYTE(lined, 2 * wsm, sval1);                     /* pix 1 */
        CHECK_BYTE(lined, 2 * wsm + 1, sval1);                 /* pix 2 */
        CHECK_BYTE(linedp, 2 * wsm, (sval1 + sval3) / 2);      /* pix 3 */
        CHECK_BYTE(linedp, 2 * wsm + 1, (sval1 + sval3) / 2);  /* pix 4 */
#undef CHECK_BYTE
#endif
    } else {  /* last row of src pixels: lastlineflag == 1 */
        linedp = lined + wpld;
        sval2 = GET_DATA_BYTE(lines, 0);
        for (j = 0, jd = 0; j < wsm; j++, jd += 2) {
            sval1 = sval2;
            sval2 = GET_DATA_BYTE(lines, j + 1);
            SET_DATA_BYTE(lined, jd, sval1);                       /* pix 1 */
            SET_DATA_BYTE(linedp, jd, sval1);                      /* pix 3 */
            SET_DATA_BYTE(lined, jd + 1, (sval1 + sval2) / 2);     /* pix 2 */
            SET_DATA_BYTE(linedp, jd + 1, (sval1 + sval2) / 2);    /* pix 4 */
        }
        sval1 = sval2;
        SET_DATA_BYTE(lined, 2 * wsm, sval1);                     /* pix 1 */
        SET_DATA_BYTE(lined, 2 * wsm + 1, sval1);                 /* pix 2 */
        SET_DATA_BYTE(linedp, 2 * wsm, sval1);                    /* pix 3 */
        SET_DATA_BYTE(linedp, 2 * wsm + 1, sval1);                /* pix 4 */
    }
}


/*------------------------------------------------------------------*
 *               4x linear interpolated gray scaling                *
 *------------------------------------------------------------------*/
/*!
 * \brief   scaleGray4xLILow()
 *
 *  This is a special case of 4x expansion by linear
 *  interpolation.  Each src pixel contains 16 dest pixels.
 *  The 16 dest pixels in src pixel 1 are numbered at
 *  their UL corners.  The 16 dest pixels in src pixel 1
 *  are related to that src pixel and its 3 neighboring
 *  src pixels as follows:
 *
 *             1---2---3---4---|---|---|---|---|
 *             |   |   |   |   |   |   |   |   |
 *             5---6---7---8---|---|---|---|---|
 *             |   |   |   |   |   |   |   |   |
 *  src 1 -->  9---a---b---c---|---|---|---|---|  <-- src 2
 *             |   |   |   |   |   |   |   |   |
 *             d---e---f---g---|---|---|---|---|
 *             |   |   |   |   |   |   |   |   |
 *             |===|===|===|===|===|===|===|===|
 *             |   |   |   |   |   |   |   |   |
 *             |---|---|---|---|---|---|---|---|
 *             |   |   |   |   |   |   |   |   |
 *  src 3 -->  |---|---|---|---|---|---|---|---|  <-- src 4
 *             |   |   |   |   |   |   |   |   |
 *             |---|---|---|---|---|---|---|---|
 *             |   |   |   |   |   |   |   |   |
 *             |---|---|---|---|---|---|---|---|
 *
 *           dest      src
 *           ----      ---
 *           dp1    =  sp1
 *           dp2    =  (3 * sp1 + sp2) / 4
 *           dp3    =  (sp1 + sp2) / 2
 *           dp4    =  (sp1 + 3 * sp2) / 4
 *           dp5    =  (3 * sp1 + sp3) / 4
 *           dp6    =  (9 * sp1 + 3 * sp2 + 3 * sp3 + sp4) / 16
 *           dp7    =  (3 * sp1 + 3 * sp2 + sp3 + sp4) / 8
 *           dp8    =  (3 * sp1 + 9 * sp2 + 1 * sp3 + 3 * sp4) / 16
 *           dp9    =  (sp1 + sp3) / 2
 *           dp10   =  (3 * sp1 + sp2 + 3 * sp3 + sp4) / 8
 *           dp11   =  (sp1 + sp2 + sp3 + sp4) / 4
 *           dp12   =  (sp1 + 3 * sp2 + sp3 + 3 * sp4) / 8
 *           dp13   =  (sp1 + 3 * sp3) / 4
 *           dp14   =  (3 * sp1 + sp2 + 9 * sp3 + 3 * sp4) / 16
 *           dp15   =  (sp1 + sp2 + 3 * sp3 + 3 * sp4) / 8
 *           dp16   =  (sp1 + 3 * sp2 + 3 * sp3 + 9 * sp4) / 16
 *
 *  We iterate over the src pixels, and unroll the calculation
 *  for each set of 16 dest pixels corresponding to that src
 *  pixel, caching pixels for the next src pixel whenever possible.
 */
static void
scaleGray4xLILow(l_uint32  *datad,
                 l_int32    wpld,
                 l_uint32  *datas,
                 l_int32    ws,
                 l_int32    hs,
                 l_int32    wpls)
{
l_int32    i, hsm;
l_uint32  *lines, *lined;

    hsm = hs - 1;

        /* We're taking 2 src and 4 dest lines at a time,
         * and for each src line, we're computing 4 dest lines.
         * Call these 4 dest lines:  destline1 - destline4.
         * The first src line is used for destline 1.
         * Two src lines are used for all other dest lines,
         * except for the last 4 dest lines, which are computed
         * using only the last src line. */

        /* iterate over all but the last src line */
    for (i = 0; i < hsm; i++) {
        lines = datas + i * wpls;
        lined = datad + 4 * i * wpld;
        scaleGray4xLILineLow(lined, wpld, lines, ws, wpls, 0);
    }

        /* last src line */
    lines = datas + hsm * wpls;
    lined = datad + 4 * hsm * wpld;
    scaleGray4xLILineLow(lined, wpld, lines, ws, wpls, 1);
}


/*!
 * \brief   scaleGray4xLILineLow()
 *
 * \param[in]    lined   ptr to top destline, to be made from current src line
 * \param[in]    wpld
 * \param[in]    lines   ptr to current src line
 * \param[in]    ws
 * \param[in]    wpls
 * \param[in]    lastlineflag  1 if last src line; 0 otherwise
 * \return  void
 */
static void
scaleGray4xLILineLow(l_uint32  *lined,
                     l_int32    wpld,
                     l_uint32  *lines,
                     l_int32    ws,
                     l_int32    wpls,
                     l_int32    lastlineflag)
{
l_int32    j, jd, wsm, wsm4;
l_int32    s1, s2, s3, s4, s1t, s2t, s3t, s4t;
l_uint32  *linesp, *linedp1, *linedp2, *linedp3;

    wsm = ws - 1;
    wsm4 = 4 * wsm;

    if (lastlineflag == 0) {
        linesp = lines + wpls;
        linedp1 = lined + wpld;
        linedp2 = lined + 2 * wpld;
        linedp3 = lined + 3 * wpld;
        s2 = GET_DATA_BYTE(lines, 0);
        s4 = GET_DATA_BYTE(linesp, 0);
        for (j = 0, jd = 0; j < wsm; j++, jd += 4) {
            s1 = s2;
            s3 = s4;
            s2 = GET_DATA_BYTE(lines, j + 1);
            s4 = GET_DATA_BYTE(linesp, j + 1);
            s1t = 3 * s1;
            s2t = 3 * s2;
            s3t = 3 * s3;
            s4t = 3 * s4;
            SET_DATA_BYTE(lined, jd, s1);                             /* d1 */
            SET_DATA_BYTE(lined, jd + 1, (s1t + s2) / 4);             /* d2 */
            SET_DATA_BYTE(lined, jd + 2, (s1 + s2) / 2);              /* d3 */
            SET_DATA_BYTE(lined, jd + 3, (s1 + s2t) / 4);             /* d4 */
            SET_DATA_BYTE(linedp1, jd, (s1t + s3) / 4);                /* d5 */
            SET_DATA_BYTE(linedp1, jd + 1, (9*s1 + s2t + s3t + s4) / 16); /*d6*/
            SET_DATA_BYTE(linedp1, jd + 2, (s1t + s2t + s3 + s4) / 8); /* d7 */
            SET_DATA_BYTE(linedp1, jd + 3, (s1t + 9*s2 + s3 + s4t) / 16);/*d8*/
            SET_DATA_BYTE(linedp2, jd, (s1 + s3) / 2);                /* d9 */
            SET_DATA_BYTE(linedp2, jd + 1, (s1t + s2 + s3t + s4) / 8);/* d10 */
            SET_DATA_BYTE(linedp2, jd + 2, (s1 + s2 + s3 + s4) / 4);  /* d11 */
            SET_DATA_BYTE(linedp2, jd + 3, (s1 + s2t + s3 + s4t) / 8);/* d12 */
            SET_DATA_BYTE(linedp3, jd, (s1 + s3t) / 4);               /* d13 */
            SET_DATA_BYTE(linedp3, jd + 1, (s1t + s2 + 9*s3 + s4t) / 16);/*d14*/
            SET_DATA_BYTE(linedp3, jd + 2, (s1 + s2 + s3t + s4t) / 8); /* d15 */
            SET_DATA_BYTE(linedp3, jd + 3, (s1 + s2t + s3t + 9*s4) / 16);/*d16*/
        }
        s1 = s2;
        s3 = s4;
        s1t = 3 * s1;
        s3t = 3 * s3;
        SET_DATA_BYTE(lined, wsm4, s1);                               /* d1 */
        SET_DATA_BYTE(lined, wsm4 + 1, s1);                           /* d2 */
        SET_DATA_BYTE(lined, wsm4 + 2, s1);                           /* d3 */
        SET_DATA_BYTE(lined, wsm4 + 3, s1);                           /* d4 */
        SET_DATA_BYTE(linedp1, wsm4, (s1t + s3) / 4);                 /* d5 */
        SET_DATA_BYTE(linedp1, wsm4 + 1, (s1t + s3) / 4);             /* d6 */
        SET_DATA_BYTE(linedp1, wsm4 + 2, (s1t + s3) / 4);             /* d7 */
        SET_DATA_BYTE(linedp1, wsm4 + 3, (s1t + s3) / 4);             /* d8 */
        SET_DATA_BYTE(linedp2, wsm4, (s1 + s3) / 2);                  /* d9 */
        SET_DATA_BYTE(linedp2, wsm4 + 1, (s1 + s3) / 2);              /* d10 */
        SET_DATA_BYTE(linedp2, wsm4 + 2, (s1 + s3) / 2);              /* d11 */
        SET_DATA_BYTE(linedp2, wsm4 + 3, (s1 + s3) / 2);              /* d12 */
        SET_DATA_BYTE(linedp3, wsm4, (s1 + s3t) / 4);                 /* d13 */
        SET_DATA_BYTE(linedp3, wsm4 + 1, (s1 + s3t) / 4);             /* d14 */
        SET_DATA_BYTE(linedp3, wsm4 + 2, (s1 + s3t) / 4);             /* d15 */
        SET_DATA_BYTE(linedp3, wsm4 + 3, (s1 + s3t) / 4);             /* d16 */
    } else {   /* last row of src pixels: lastlineflag == 1 */
        linedp1 = lined + wpld;
        linedp2 = lined + 2 * wpld;
        linedp3 = lined + 3 * wpld;
        s2 = GET_DATA_BYTE(lines, 0);
        for (j = 0, jd = 0; j < wsm; j++, jd += 4) {
            s1 = s2;
            s2 = GET_DATA_BYTE(lines, j + 1);
            s1t = 3 * s1;
            s2t = 3 * s2;
            SET_DATA_BYTE(lined, jd, s1);                            /* d1 */
            SET_DATA_BYTE(lined, jd + 1, (s1t + s2) / 4 );           /* d2 */
            SET_DATA_BYTE(lined, jd + 2, (s1 + s2) / 2 );            /* d3 */
            SET_DATA_BYTE(lined, jd + 3, (s1 + s2t) / 4 );           /* d4 */
            SET_DATA_BYTE(linedp1, jd, s1);                          /* d5 */
            SET_DATA_BYTE(linedp1, jd + 1, (s1t + s2) / 4 );         /* d6 */
            SET_DATA_BYTE(linedp1, jd + 2, (s1 + s2) / 2 );          /* d7 */
            SET_DATA_BYTE(linedp1, jd + 3, (s1 + s2t) / 4 );         /* d8 */
            SET_DATA_BYTE(linedp2, jd, s1);                          /* d9 */
            SET_DATA_BYTE(linedp2, jd + 1, (s1t + s2) / 4 );         /* d10 */
            SET_DATA_BYTE(linedp2, jd + 2, (s1 + s2) / 2 );          /* d11 */
            SET_DATA_BYTE(linedp2, jd + 3, (s1 + s2t) / 4 );         /* d12 */
            SET_DATA_BYTE(linedp3, jd, s1);                          /* d13 */
            SET_DATA_BYTE(linedp3, jd + 1, (s1t + s2) / 4 );         /* d14 */
            SET_DATA_BYTE(linedp3, jd + 2, (s1 + s2) / 2 );          /* d15 */
            SET_DATA_BYTE(linedp3, jd + 3, (s1 + s2t) / 4 );         /* d16 */
        }
        s1 = s2;
        SET_DATA_BYTE(lined, wsm4, s1);                              /* d1 */
        SET_DATA_BYTE(lined, wsm4 + 1, s1);                          /* d2 */
        SET_DATA_BYTE(lined, wsm4 + 2, s1);                          /* d3 */
        SET_DATA_BYTE(lined, wsm4 + 3, s1);                          /* d4 */
        SET_DATA_BYTE(linedp1, wsm4, s1);                            /* d5 */
        SET_DATA_BYTE(linedp1, wsm4 + 1, s1);                        /* d6 */
        SET_DATA_BYTE(linedp1, wsm4 + 2, s1);                        /* d7 */
        SET_DATA_BYTE(linedp1, wsm4 + 3, s1);                        /* d8 */
        SET_DATA_BYTE(linedp2, wsm4, s1);                            /* d9 */
        SET_DATA_BYTE(linedp2, wsm4 + 1, s1);                        /* d10 */
        SET_DATA_BYTE(linedp2, wsm4 + 2, s1);                        /* d11 */
        SET_DATA_BYTE(linedp2, wsm4 + 3, s1);                        /* d12 */
        SET_DATA_BYTE(linedp3, wsm4, s1);                            /* d13 */
        SET_DATA_BYTE(linedp3, wsm4 + 1, s1);                        /* d14 */
        SET_DATA_BYTE(linedp3, wsm4 + 2, s1);                        /* d15 */
        SET_DATA_BYTE(linedp3, wsm4 + 3, s1);                        /* d16 */
    }
}


/*------------------------------------------------------------------*
 *       Grayscale and color scaling by closest pixel sampling      *
 *------------------------------------------------------------------*/
/*!
 * \brief   scaleBySamplingLow()
 *
 *  Notes:
 *      (1) The dest must be cleared prior to this operation,
 *          and we clear it here in the low-level code.
 *      (2) We reuse dest pixels and dest pixel rows whenever
 *          possible.  This speeds the upscaling; downscaling
 *          is done by strict subsampling and is unaffected.
 *      (3) Because we are sampling and not interpolating, this
 *          routine works directly, without conversion to full
 *          RGB color, for 2, 4 or 8 bpp palette color images.
 */
static l_int32
scaleBySamplingLow(l_uint32  *datad,
                   l_int32    wd,
                   l_int32    hd,
                   l_int32    wpld,
                   l_uint32  *datas,
                   l_int32    ws,
                   l_int32    hs,
                   l_int32    d,
                   l_int32    wpls)
{
l_int32    i, j, bpld;
l_int32    xs, prevxs, sval;
l_int32   *srow, *scol;
l_uint32   csval;
l_uint32  *lines, *prevlines, *lined, *prevlined;
l_float32  wratio, hratio;

    PROCNAME("scaleBySamplingLow");

    if (d != 2 && d != 4 && d !=8 && d != 16 && d != 32)
        return ERROR_INT("pixel depth not supported", procName, 1);

        /* clear dest */
    bpld = 4 * wpld;
    memset((char *)datad, 0, hd * bpld);

        /* the source row corresponding to dest row i ==> srow[i]
         * the source col corresponding to dest col j ==> scol[j]  */
    if ((srow = (l_int32 *)LEPT_CALLOC(hd, sizeof(l_int32))) == NULL)
        return ERROR_INT("srow not made", procName, 1);
    if ((scol = (l_int32 *)LEPT_CALLOC(wd, sizeof(l_int32))) == NULL) {
        LEPT_FREE(srow);
        return ERROR_INT("scol not made", procName, 1);
    }

    wratio = (l_float32)ws / (l_float32)wd;
    hratio = (l_float32)hs / (l_float32)hd;
    for (i = 0; i < hd; i++)
        srow[i] = L_MIN((l_int32)(hratio * i + 0.5), hs - 1);
    for (j = 0; j < wd; j++)
        scol[j] = L_MIN((l_int32)(wratio * j + 0.5), ws - 1);

    prevlines = NULL;
    for (i = 0; i < hd; i++) {
        lines = datas + srow[i] * wpls;
        lined = datad + i * wpld;
        if (lines != prevlines) {  /* make dest from new source row */
            prevxs = -1;
            sval = 0;
            csval = 0;
            if (d == 2) {
                for (j = 0; j < wd; j++) {
                    xs = scol[j];
                    if (xs != prevxs) {  /* get dest pix from source col */
                        sval = GET_DATA_DIBIT(lines, xs);
                        SET_DATA_DIBIT(lined, j, sval);
                        prevxs = xs;
                    } else {  /* copy prev dest pix */
                        SET_DATA_DIBIT(lined, j, sval);
                    }
                }
            } else if (d == 4) {
                for (j = 0; j < wd; j++) {
                    xs = scol[j];
                    if (xs != prevxs) {  /* get dest pix from source col */
                        sval = GET_DATA_QBIT(lines, xs);
                        SET_DATA_QBIT(lined, j, sval);
                        prevxs = xs;
                    } else {  /* copy prev dest pix */
                        SET_DATA_QBIT(lined, j, sval);
                    }
                }
            } else if (d == 8) {
                for (j = 0; j < wd; j++) {
                    xs = scol[j];
                    if (xs != prevxs) {  /* get dest pix from source col */
                        sval = GET_DATA_BYTE(lines, xs);
                        SET_DATA_BYTE(lined, j, sval);
                        prevxs = xs;
                    } else {  /* copy prev dest pix */
                        SET_DATA_BYTE(lined, j, sval);
                    }
                }
            } else if (d == 16) {
                for (j = 0; j < wd; j++) {
                    xs = scol[j];
                    if (xs != prevxs) {  /* get dest pix from source col */
                        sval = GET_DATA_TWO_BYTES(lines, xs);
                        SET_DATA_TWO_BYTES(lined, j, sval);
                        prevxs = xs;
                    } else {  /* copy prev dest pix */
                        SET_DATA_TWO_BYTES(lined, j, sval);
                    }
                }
            } else {  /* d == 32 */
                for (j = 0; j < wd; j++) {
                    xs = scol[j];
                    if (xs != prevxs) {  /* get dest pix from source col */
                        csval = lines[xs];
                        lined[j] = csval;
                        prevxs = xs;
                    } else {  /* copy prev dest pix */
                        lined[j] = csval;
                    }
                }
            }
        } else {  /* lines == prevlines; copy prev dest row */
            prevlined = lined - wpld;
            memcpy((char *)lined, (char *)prevlined, bpld);
        }
        prevlines = lines;
    }

    LEPT_FREE(srow);
    LEPT_FREE(scol);
    return 0;
}


/*------------------------------------------------------------------*
 *    Color and grayscale downsampling with (antialias) smoothing   *
 *------------------------------------------------------------------*/
/*!
 * \brief   scaleSmoothLow()
 *
 *  Notes:
 *      (1) This function is called on 8 or 32 bpp src and dest images.
 *      (2) size is the full width of the lowpass smoothing filter.
 *          It is correlated with the reduction ratio, being the
 *          nearest integer such that size is approximately equal to hs / hd.
 */
static l_int32
scaleSmoothLow(l_uint32  *datad,
               l_int32    wd,
               l_int32    hd,
               l_int32    wpld,
               l_uint32  *datas,
               l_int32    ws,
               l_int32    hs,
               l_int32    d,
               l_int32    wpls,
               l_int32    size)
{
l_int32    i, j, m, n, xstart;
l_int32    val, rval, gval, bval;
l_int32   *srow, *scol;
l_uint32  *lines, *lined, *line, *ppixel;
l_uint32   pixel;
l_float32  wratio, hratio, norm;

    PROCNAME("scaleSmoothLow");

        /* Clear dest */
    memset((char *)datad, 0, 4 * wpld * hd);

        /* Each dest pixel at (j,i) is computed as the average
           of size^2 corresponding src pixels.
           We store the UL corner location of the square of
           src pixels that correspond to dest pixel (j,i).
           The are labeled by the arrays srow[i] and scol[j]. */
    if ((srow = (l_int32 *)LEPT_CALLOC(hd, sizeof(l_int32))) == NULL)
        return ERROR_INT("srow not made", procName, 1);
    if ((scol = (l_int32 *)LEPT_CALLOC(wd, sizeof(l_int32))) == NULL) {
        LEPT_FREE(srow);
        return ERROR_INT("scol not made", procName, 1);
    }

    norm = 1. / (l_float32)(size * size);
    wratio = (l_float32)ws / (l_float32)wd;
    hratio = (l_float32)hs / (l_float32)hd;
    for (i = 0; i < hd; i++)
        srow[i] = L_MIN((l_int32)(hratio * i), hs - size);
    for (j = 0; j < wd; j++)
        scol[j] = L_MIN((l_int32)(wratio * j), ws - size);

        /* For each dest pixel, compute average */
    if (d == 8) {
        for (i = 0; i < hd; i++) {
            lines = datas + srow[i] * wpls;
            lined = datad + i * wpld;
            for (j = 0; j < wd; j++) {
                xstart = scol[j];
                val = 0;
                for (m = 0; m < size; m++) {
                    line = lines + m * wpls;
                    for (n = 0; n < size; n++) {
                        val += GET_DATA_BYTE(line, xstart + n);
                    }
                }
                val = (l_int32)((l_float32)val * norm);
                SET_DATA_BYTE(lined, j, val);
            }
        }
    } else {  /* d == 32 */
        for (i = 0; i < hd; i++) {
            lines = datas + srow[i] * wpls;
            lined = datad + i * wpld;
            for (j = 0; j < wd; j++) {
                xstart = scol[j];
                rval = gval = bval = 0;
                for (m = 0; m < size; m++) {
                    ppixel = lines + m * wpls + xstart;
                    for (n = 0; n < size; n++) {
                        pixel = *(ppixel + n);
                        rval += (pixel >> L_RED_SHIFT) & 0xff;
                        gval += (pixel >> L_GREEN_SHIFT) & 0xff;
                        bval += (pixel >> L_BLUE_SHIFT) & 0xff;
                    }
                }
                rval = (l_int32)((l_float32)rval * norm);
                gval = (l_int32)((l_float32)gval * norm);
                bval = (l_int32)((l_float32)bval * norm);
                *(lined + j) = rval << L_RED_SHIFT |
                               gval << L_GREEN_SHIFT |
                               bval << L_BLUE_SHIFT;
            }
        }
    }

    LEPT_FREE(srow);
    LEPT_FREE(scol);
    return 0;
}


/*!
 * \brief   scaleRGBToGray2Low()
 *
 *  Notes:
 *      (1) This function is called with 32 bpp RGB src and 8 bpp,
 *          half-resolution dest.  The weights should add to 1.0.
 */
static void
scaleRGBToGray2Low(l_uint32  *datad,
                   l_int32    wd,
                   l_int32    hd,
                   l_int32    wpld,
                   l_uint32  *datas,
                   l_int32    wpls,
                   l_float32  rwt,
                   l_float32  gwt,
                   l_float32  bwt)
{
l_int32    i, j, val, rval, gval, bval;
l_uint32  *lines, *lined;
l_uint32   pixel;

    rwt *= 0.25;
    gwt *= 0.25;
    bwt *= 0.25;
    for (i = 0; i < hd; i++) {
        lines = datas + 2 * i * wpls;
        lined = datad + i * wpld;
        for (j = 0; j < wd; j++) {
                /* Sum each of the color components from 4 src pixels */
            pixel = *(lines + 2 * j);
            rval = (pixel >> L_RED_SHIFT) & 0xff;
            gval = (pixel >> L_GREEN_SHIFT) & 0xff;
            bval = (pixel >> L_BLUE_SHIFT) & 0xff;
            pixel = *(lines + 2 * j + 1);
            rval += (pixel >> L_RED_SHIFT) & 0xff;
            gval += (pixel >> L_GREEN_SHIFT) & 0xff;
            bval += (pixel >> L_BLUE_SHIFT) & 0xff;
            pixel = *(lines + wpls + 2 * j);
            rval += (pixel >> L_RED_SHIFT) & 0xff;
            gval += (pixel >> L_GREEN_SHIFT) & 0xff;
            bval += (pixel >> L_BLUE_SHIFT) & 0xff;
            pixel = *(lines + wpls + 2 * j + 1);
            rval += (pixel >> L_RED_SHIFT) & 0xff;
            gval += (pixel >> L_GREEN_SHIFT) & 0xff;
            bval += (pixel >> L_BLUE_SHIFT) & 0xff;
                /* Generate the dest byte as a weighted sum of the averages */
            val = (l_int32)(rwt * rval + gwt * gval + bwt * bval);
            SET_DATA_BYTE(lined, j, val);
        }
    }
}


/*------------------------------------------------------------------*
 *                  General area mapped gray scaling                *
 *------------------------------------------------------------------*/
/*!
 * \brief   scaleColorAreaMapLow()
 *
 *  This should only be used for downscaling.
 *  We choose to divide each pixel into 16 x 16 sub-pixels.
 *  This is much slower than scaleSmoothLow(), but it gives a
 *  better representation, esp. for downscaling factors between
 *  1.5 and 5.  All src pixels are subdivided into 256 sub-pixels,
 *  and are weighted by the number of sub-pixels covered by
 *  the dest pixel.  This is about 2x slower than scaleSmoothLow(),
 *  but the results are significantly better on small text.
 */
static void
scaleColorAreaMapLow(l_uint32  *datad,
                    l_int32    wd,
                    l_int32    hd,
                    l_int32    wpld,
                    l_uint32  *datas,
                    l_int32    ws,
                    l_int32    hs,
                    l_int32    wpls)
{
l_int32    i, j, k, m, wm2, hm2;
l_int32    area00, area10, area01, area11, areal, arear, areat, areab;
l_int32    xu, yu;  /* UL corner in src image, to 1/16 of a pixel */
l_int32    xl, yl;  /* LR corner in src image, to 1/16 of a pixel */
l_int32    xup, yup, xuf, yuf;  /* UL src pixel: integer and fraction */
l_int32    xlp, ylp, xlf, ylf;  /* LR src pixel: integer and fraction */
l_int32    delx, dely, area;
l_int32    v00r, v00g, v00b;  /* contrib. from UL src pixel */
l_int32    v01r, v01g, v01b;  /* contrib. from LL src pixel */
l_int32    v10r, v10g, v10b;  /* contrib from UR src pixel */
l_int32    v11r, v11g, v11b;  /* contrib from LR src pixel */
l_int32    vinr, ving, vinb;  /* contrib from all full interior src pixels */
l_int32    vmidr, vmidg, vmidb;  /* contrib from side parts */
l_int32    rval, gval, bval;
l_uint32   pixel00, pixel10, pixel01, pixel11, pixel;
l_uint32  *lines, *lined;
l_float32  scx, scy;

        /* (scx, scy) are scaling factors that are applied to the
         * dest coords to get the corresponding src coords.
         * We need them because we iterate over dest pixels
         * and must find the corresponding set of src pixels. */
    scx = 16. * (l_float32)ws / (l_float32)wd;
    scy = 16. * (l_float32)hs / (l_float32)hd;
    wm2 = ws - 2;
    hm2 = hs - 2;

        /* Iterate over the destination pixels */
    for (i = 0; i < hd; i++) {
        yu = (l_int32)(scy * i);
        yl = (l_int32)(scy * (i + 1.0));
        yup = yu >> 4;
        yuf = yu & 0x0f;
        ylp = yl >> 4;
        ylf = yl & 0x0f;
        dely = ylp - yup;
        lined = datad + i * wpld;
        lines = datas + yup * wpls;
        for (j = 0; j < wd; j++) {
            xu = (l_int32)(scx * j);
            xl = (l_int32)(scx * (j + 1.0));
            xup = xu >> 4;
            xuf = xu & 0x0f;
            xlp = xl >> 4;
            xlf = xl & 0x0f;
            delx = xlp - xup;

                /* If near the edge, just use a src pixel value */
            if (xlp > wm2 || ylp > hm2) {
                *(lined + j) = *(lines + xup);
                continue;
            }

                /* Area summed over, in subpixels.  This varies
                 * due to the quantization, so we can't simply take
                 * the area to be a constant: area = scx * scy. */
            area = ((16 - xuf) + 16 * (delx - 1) + xlf) *
                   ((16 - yuf) + 16 * (dely - 1) + ylf);

                /* Do area map summation */
            pixel00 = *(lines + xup);
            pixel10 = *(lines + xlp);
            pixel01 = *(lines + dely * wpls +  xup);
            pixel11 = *(lines + dely * wpls +  xlp);
            area00 = (16 - xuf) * (16 - yuf);
            area10 = xlf * (16 - yuf);
            area01 = (16 - xuf) * ylf;
            area11 = xlf * ylf;
            v00r = area00 * ((pixel00 >> L_RED_SHIFT) & 0xff);
            v00g = area00 * ((pixel00 >> L_GREEN_SHIFT) & 0xff);
            v00b = area00 * ((pixel00 >> L_BLUE_SHIFT) & 0xff);
            v10r = area10 * ((pixel10 >> L_RED_SHIFT) & 0xff);
            v10g = area10 * ((pixel10 >> L_GREEN_SHIFT) & 0xff);
            v10b = area10 * ((pixel10 >> L_BLUE_SHIFT) & 0xff);
            v01r = area01 * ((pixel01 >> L_RED_SHIFT) & 0xff);
            v01g = area01 * ((pixel01 >> L_GREEN_SHIFT) & 0xff);
            v01b = area01 * ((pixel01 >> L_BLUE_SHIFT) & 0xff);
            v11r = area11 * ((pixel11 >> L_RED_SHIFT) & 0xff);
            v11g = area11 * ((pixel11 >> L_GREEN_SHIFT) & 0xff);
            v11b = area11 * ((pixel11 >> L_BLUE_SHIFT) & 0xff);
            vinr = ving = vinb = 0;
            for (k = 1; k < dely; k++) {  /* for full src pixels */
                for (m = 1; m < delx; m++) {
                    pixel = *(lines + k * wpls + xup + m);
                    vinr += 256 * ((pixel >> L_RED_SHIFT) & 0xff);
                    ving += 256 * ((pixel >> L_GREEN_SHIFT) & 0xff);
                    vinb += 256 * ((pixel >> L_BLUE_SHIFT) & 0xff);
                }
            }
            vmidr = vmidg = vmidb = 0;
            areal = (16 - xuf) * 16;
            arear = xlf * 16;
            areat = 16 * (16 - yuf);
            areab = 16 * ylf;
            for (k = 1; k < dely; k++) {  /* for left side */
                pixel = *(lines + k * wpls + xup);
                vmidr += areal * ((pixel >> L_RED_SHIFT) & 0xff);
                vmidg += areal * ((pixel >> L_GREEN_SHIFT) & 0xff);
                vmidb += areal * ((pixel >> L_BLUE_SHIFT) & 0xff);
            }
            for (k = 1; k < dely; k++) {  /* for right side */
                pixel = *(lines + k * wpls + xlp);
                vmidr += arear * ((pixel >> L_RED_SHIFT) & 0xff);
                vmidg += arear * ((pixel >> L_GREEN_SHIFT) & 0xff);
                vmidb += arear * ((pixel >> L_BLUE_SHIFT) & 0xff);
            }
            for (m = 1; m < delx; m++) {  /* for top side */
                pixel = *(lines + xup + m);
                vmidr += areat * ((pixel >> L_RED_SHIFT) & 0xff);
                vmidg += areat * ((pixel >> L_GREEN_SHIFT) & 0xff);
                vmidb += areat * ((pixel >> L_BLUE_SHIFT) & 0xff);
            }
            for (m = 1; m < delx; m++) {  /* for bottom side */
                pixel = *(lines + dely * wpls + xup + m);
                vmidr += areab * ((pixel >> L_RED_SHIFT) & 0xff);
                vmidg += areab * ((pixel >> L_GREEN_SHIFT) & 0xff);
                vmidb += areab * ((pixel >> L_BLUE_SHIFT) & 0xff);
            }

                /* Sum all the contributions */
            rval = (v00r + v01r + v10r + v11r + vinr + vmidr + 128) / area;
            gval = (v00g + v01g + v10g + v11g + ving + vmidg + 128) / area;
            bval = (v00b + v01b + v10b + v11b + vinb + vmidb + 128) / area;
#if  DEBUG_OVERFLOW
            if (rval > 255) fprintf(stderr, "rval ovfl: %d\n", rval);
            if (gval > 255) fprintf(stderr, "gval ovfl: %d\n", gval);
            if (bval > 255) fprintf(stderr, "bval ovfl: %d\n", bval);
#endif  /* DEBUG_OVERFLOW */
            composeRGBPixel(rval, gval, bval, lined + j);
        }
    }
}


/*!
 * \brief   scaleGrayAreaMapLow()
 *
 *  This should only be used for downscaling.
 *  We choose to divide each pixel into 16 x 16 sub-pixels.
 *  This is about 2x slower than scaleSmoothLow(), but the results
 *  are significantly better on small text, esp. for downscaling
 *  factors between 1.5 and 5.  All src pixels are subdivided
 *  into 256 sub-pixels, and are weighted by the number of
 *  sub-pixels covered by the dest pixel.
 */
static void
scaleGrayAreaMapLow(l_uint32  *datad,
                    l_int32    wd,
                    l_int32    hd,
                    l_int32    wpld,
                    l_uint32  *datas,
                    l_int32    ws,
                    l_int32    hs,
                    l_int32    wpls)
{
l_int32    i, j, k, m, wm2, hm2;
l_int32    xu, yu;  /* UL corner in src image, to 1/16 of a pixel */
l_int32    xl, yl;  /* LR corner in src image, to 1/16 of a pixel */
l_int32    xup, yup, xuf, yuf;  /* UL src pixel: integer and fraction */
l_int32    xlp, ylp, xlf, ylf;  /* LR src pixel: integer and fraction */
l_int32    delx, dely, area;
l_int32    v00;  /* contrib. from UL src pixel */
l_int32    v01;  /* contrib. from LL src pixel */
l_int32    v10;  /* contrib from UR src pixel */
l_int32    v11;  /* contrib from LR src pixel */
l_int32    vin;  /* contrib from all full interior src pixels */
l_int32    vmid;  /* contrib from side parts that are full in 1 direction */
l_int32    val;
l_uint32  *lines, *lined;
l_float32  scx, scy;

        /* (scx, scy) are scaling factors that are applied to the
         * dest coords to get the corresponding src coords.
         * We need them because we iterate over dest pixels
         * and must find the corresponding set of src pixels. */
    scx = 16. * (l_float32)ws / (l_float32)wd;
    scy = 16. * (l_float32)hs / (l_float32)hd;
    wm2 = ws - 2;
    hm2 = hs - 2;

        /* Iterate over the destination pixels */
    for (i = 0; i < hd; i++) {
        yu = (l_int32)(scy * i);
        yl = (l_int32)(scy * (i + 1.0));
        yup = yu >> 4;
        yuf = yu & 0x0f;
        ylp = yl >> 4;
        ylf = yl & 0x0f;
        dely = ylp - yup;
        lined = datad + i * wpld;
        lines = datas + yup * wpls;
        for (j = 0; j < wd; j++) {
            xu = (l_int32)(scx * j);
            xl = (l_int32)(scx * (j + 1.0));
            xup = xu >> 4;
            xuf = xu & 0x0f;
            xlp = xl >> 4;
            xlf = xl & 0x0f;
            delx = xlp - xup;

                /* If near the edge, just use a src pixel value */
            if (xlp > wm2 || ylp > hm2) {
                SET_DATA_BYTE(lined, j, GET_DATA_BYTE(lines, xup));
                continue;
            }

                /* Area summed over, in subpixels.  This varies
                 * due to the quantization, so we can't simply take
                 * the area to be a constant: area = scx * scy. */
            area = ((16 - xuf) + 16 * (delx - 1) + xlf) *
                   ((16 - yuf) + 16 * (dely - 1) + ylf);

                /* Do area map summation */
            v00 = (16 - xuf) * (16 - yuf) * GET_DATA_BYTE(lines, xup);
            v10 = xlf * (16 - yuf) * GET_DATA_BYTE(lines, xlp);
            v01 = (16 - xuf) * ylf * GET_DATA_BYTE(lines + dely * wpls, xup);
            v11 = xlf * ylf * GET_DATA_BYTE(lines + dely * wpls, xlp);
            for (vin = 0, k = 1; k < dely; k++) {  /* for full src pixels */
                 for (m = 1; m < delx; m++) {
                     vin += 256 * GET_DATA_BYTE(lines + k * wpls, xup + m);
                 }
            }
            for (vmid = 0, k = 1; k < dely; k++)  /* for left side */
                vmid += (16 - xuf) * 16 * GET_DATA_BYTE(lines + k * wpls, xup);
            for (k = 1; k < dely; k++)  /* for right side */
                vmid += xlf * 16 * GET_DATA_BYTE(lines + k * wpls, xlp);
            for (m = 1; m < delx; m++)  /* for top side */
                vmid += 16 * (16 - yuf) * GET_DATA_BYTE(lines, xup + m);
            for (m = 1; m < delx; m++)  /* for bottom side */
                vmid += 16 * ylf * GET_DATA_BYTE(lines + dely * wpls, xup + m);
            val = (v00 + v01 + v10 + v11 + vin + vmid + 128) / area;
#if  DEBUG_OVERFLOW
            if (val > 255) fprintf(stderr, "val overflow: %d\n", val);
#endif  /* DEBUG_OVERFLOW */
            SET_DATA_BYTE(lined, j, val);
        }
    }
}


/*------------------------------------------------------------------*
 *                     2x area mapped downscaling                   *
 *------------------------------------------------------------------*/
/*!
 * \brief   scaleAreaMapLow2()
 *
 *  Notes:
 *        This function is called with either 8 bpp gray or 32 bpp RGB.
 *        The result is a 2x reduced dest.
 */
static void
scaleAreaMapLow2(l_uint32  *datad,
                 l_int32    wd,
                 l_int32    hd,
                 l_int32    wpld,
                 l_uint32  *datas,
                 l_int32    d,
                 l_int32    wpls)
{
l_int32    i, j, val, rval, gval, bval;
l_uint32  *lines, *lined;
l_uint32   pixel;

    if (d == 8) {
        for (i = 0; i < hd; i++) {
            lines = datas + 2 * i * wpls;
            lined = datad + i * wpld;
            for (j = 0; j < wd; j++) {
                    /* Average each dest pixel using 4 src pixels */
                val = GET_DATA_BYTE(lines, 2 * j);
                val += GET_DATA_BYTE(lines, 2 * j + 1);
                val += GET_DATA_BYTE(lines + wpls, 2 * j);
                val += GET_DATA_BYTE(lines + wpls, 2 * j + 1);
                val >>= 2;
                SET_DATA_BYTE(lined, j, val);
            }
        }
    } else {  /* d == 32 */
        for (i = 0; i < hd; i++) {
            lines = datas + 2 * i * wpls;
            lined = datad + i * wpld;
            for (j = 0; j < wd; j++) {
                    /* Average each of the color components from 4 src pixels */
                pixel = *(lines + 2 * j);
                rval = (pixel >> L_RED_SHIFT) & 0xff;
                gval = (pixel >> L_GREEN_SHIFT) & 0xff;
                bval = (pixel >> L_BLUE_SHIFT) & 0xff;
                pixel = *(lines + 2 * j + 1);
                rval += (pixel >> L_RED_SHIFT) & 0xff;
                gval += (pixel >> L_GREEN_SHIFT) & 0xff;
                bval += (pixel >> L_BLUE_SHIFT) & 0xff;
                pixel = *(lines + wpls + 2 * j);
                rval += (pixel >> L_RED_SHIFT) & 0xff;
                gval += (pixel >> L_GREEN_SHIFT) & 0xff;
                bval += (pixel >> L_BLUE_SHIFT) & 0xff;
                pixel = *(lines + wpls + 2 * j + 1);
                rval += (pixel >> L_RED_SHIFT) & 0xff;
                gval += (pixel >> L_GREEN_SHIFT) & 0xff;
                bval += (pixel >> L_BLUE_SHIFT) & 0xff;
                composeRGBPixel(rval >> 2, gval >> 2, bval >> 2, &pixel);
                *(lined + j) = pixel;
            }
        }
    }
}


/*------------------------------------------------------------------*
 *              Binary scaling by closest pixel sampling            *
 *------------------------------------------------------------------*/
/*
 *  scaleBinaryLow()
 *
 *  Notes:
 *      (1) The dest must be cleared prior to this operation,
 *          and we clear it here in the low-level code.
 *      (2) We reuse dest pixels and dest pixel rows whenever
 *          possible for upscaling; downscaling is done by
 *          strict subsampling.
 */
static l_int32
scaleBinaryLow(l_uint32  *datad,
               l_int32    wd,
               l_int32    hd,
               l_int32    wpld,
               l_uint32  *datas,
               l_int32    ws,
               l_int32    hs,
               l_int32    wpls)
{
l_int32    i, j, bpld;
l_int32    xs, prevxs, sval;
l_int32   *srow, *scol;
l_uint32  *lines, *prevlines, *lined, *prevlined;
l_float32  wratio, hratio;

    PROCNAME("scaleBinaryLow");

        /* clear dest */
    bpld = 4 * wpld;
    memset((char *)datad, 0, hd * bpld);

        /* The source row corresponding to dest row i ==> srow[i]
         * The source col corresponding to dest col j ==> scol[j]  */
    if ((srow = (l_int32 *)LEPT_CALLOC(hd, sizeof(l_int32))) == NULL)
        return ERROR_INT("srow not made", procName, 1);
    if ((scol = (l_int32 *)LEPT_CALLOC(wd, sizeof(l_int32))) == NULL) {
        LEPT_FREE(srow);
        return ERROR_INT("scol not made", procName, 1);
    }

    wratio = (l_float32)ws / (l_float32)wd;
    hratio = (l_float32)hs / (l_float32)hd;
    for (i = 0; i < hd; i++)
        srow[i] = L_MIN((l_int32)(hratio * i + 0.5), hs - 1);
    for (j = 0; j < wd; j++)
        scol[j] = L_MIN((l_int32)(wratio * j + 0.5), ws - 1);

    prevlines = NULL;
    prevxs = -1;
    sval = 0;
    for (i = 0; i < hd; i++) {
        lines = datas + srow[i] * wpls;
        lined = datad + i * wpld;
        if (lines != prevlines) {  /* make dest from new source row */
            for (j = 0; j < wd; j++) {
                xs = scol[j];
                if (xs != prevxs) {  /* get dest pix from source col */
                    if ((sval = GET_DATA_BIT(lines, xs)))
                        SET_DATA_BIT(lined, j);
                    prevxs = xs;
                } else {  /* copy prev dest pix, if set */
                    if (sval)
                        SET_DATA_BIT(lined, j);
                }
            }
        } else {  /* lines == prevlines; copy prev dest row */
            prevlined = lined - wpld;
            memcpy((char *)lined, (char *)prevlined, bpld);
        }
        prevlines = lines;
    }

    LEPT_FREE(srow);
    LEPT_FREE(scol);
    return 0;
}