1 /*====================================================================*
2  -  Copyright (C) 2001 Leptonica.  All rights reserved.
3  -
4  -  Redistribution and use in source and binary forms, with or without
5  -  modification, are permitted provided that the following conditions
6  -  are met:
7  -  1. Redistributions of source code must retain the above copyright
8  -     notice, this list of conditions and the following disclaimer.
9  -  2. Redistributions in binary form must reproduce the above
10  -     copyright notice, this list of conditions and the following
11  -     disclaimer in the documentation and/or other materials
12  -     provided with the distribution.
13  -
14  -  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
15  -  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
16  -  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
17  -  A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL ANY
18  -  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19  -  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20  -  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21  -  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22  -  OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
23  -  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24  -  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  *====================================================================*/
26 
27 /*!
28  * \file  numafunc2.c
29  * <pre>
30  *
31  *      Morphological (min/max) operations
32  *          NUMA        *numaErode()
33  *          NUMA        *numaDilate()
34  *          NUMA        *numaOpen()
35  *          NUMA        *numaClose()
36  *
37  *      Other transforms
38  *          NUMA        *numaTransform()
39  *          l_int32      numaSimpleStats()
40  *          l_int32      numaWindowedStats()
41  *          NUMA        *numaWindowedMean()
42  *          NUMA        *numaWindowedMeanSquare()
43  *          l_int32      numaWindowedVariance()
44  *          NUMA        *numaWindowedMedian()
45  *          NUMA        *numaConvertToInt()
46  *
47  *      Histogram generation and statistics
48  *          NUMA        *numaMakeHistogram()
49  *          NUMA        *numaMakeHistogramAuto()
50  *          NUMA        *numaMakeHistogramClipped()
51  *          NUMA        *numaRebinHistogram()
52  *          NUMA        *numaNormalizeHistogram()
53  *          l_int32      numaGetStatsUsingHistogram()
54  *          l_int32      numaGetHistogramStats()
55  *          l_int32      numaGetHistogramStatsOnInterval()
56  *          l_int32      numaMakeRankFromHistogram()
57  *          l_int32      numaHistogramGetRankFromVal()
58  *          l_int32      numaHistogramGetValFromRank()
59  *          l_int32      numaDiscretizeRankAndIntensity()
60  *          l_int32      numaGetRankBinValues()
61  *
62  *      Splitting a distribution
63  *          l_int32      numaSplitDistribution()
64  *
65  *      Comparing histograms
66  *          l_int32      grayHistogramsToEMD()
67  *          l_int32      numaEarthMoverDistance()
68  *          l_int32      grayInterHistogramStats()
69  *
70  *      Extrema finding
71  *          NUMA        *numaFindPeaks()
72  *          NUMA        *numaFindExtrema()
73  *          l_int32     *numaCountReversals()
74  *
75  *      Threshold crossings and frequency analysis
76  *          l_int32      numaSelectCrossingThreshold()
77  *          NUMA        *numaCrossingsByThreshold()
78  *          NUMA        *numaCrossingsByPeaks()
79  *          NUMA        *numaEvalBestHaarParameters()
80  *          l_int32      numaEvalHaarSum()
81  *
82  *      Generating numbers in a range under constraints
83  *          NUMA        *genConstrainedNumaInRange()
84  *
85  *    Things to remember when using the Numa:
86  *
87  *    (1) The numa is a struct, not an array.  Always use accessors
88  *        (see numabasic.c), never the fields directly.
89  *
90  *    (2) The number array holds l_float32 values.  It can also
91  *        be used to store l_int32 values.  See numabasic.c for
92  *        details on using the accessors.  Integers larger than
93  *        about 10M will lose accuracy due on retrieval due to round-off.
94  *        For large integers, use the dna (array of l_float64) instead.
95  *
96  *    (3) Occasionally, in the comments we denote the i-th element of a
97  *        numa by na[i].  This is conceptual only -- the numa is not an array!
98  *
99  *    Some general comments on histograms:
100  *
101  *    (1) Histograms are the generic statistical representation of
102  *        the data about some attribute.  Typically they're not
103  *        normalized -- they simply give the number of occurrences
104  *        within each range of values of the attribute.  This range
105  *        of values is referred to as a 'bucket'.  For example,
106  *        the histogram could specify how many connected components
107  *        are found for each value of their width; in that case,
108  *        the bucket size is 1.
109  *
110  *    (2) In leptonica, all buckets have the same size.  Histograms
111  *        are therefore specified by a numa of occurrences, along
112  *        with two other numbers: the 'value' associated with the
113  *        occupants of the first bucket and the size (i.e., 'width')
114  *        of each bucket.  These two numbers then allow us to calculate
115  *        the value associated with the occupants of each bucket.
116  *        These numbers are fields in the numa, initialized to
117  *        a startx value of 0.0 and a binsize of 1.0.  Accessors for
118  *        these fields are functions numa*Parameters().  All histograms
119  *        must have these two numbers properly set.
120  * </pre>
121  */
122 
123 #include <math.h>
124 #include "allheaders.h"
125 
126     /* bin sizes in numaMakeHistogram() */
127 static const l_int32 BinSizeArray[] = {2, 5, 10, 20, 50, 100, 200, 500, 1000,\
128                       2000, 5000, 10000, 20000, 50000, 100000, 200000,\
129                       500000, 1000000, 2000000, 5000000, 10000000,\
130                       200000000, 50000000, 100000000};
131 static const l_int32 NBinSizes = 24;
132 
133 
134 #ifndef  NO_CONSOLE_IO
135 #define  DEBUG_HISTO        0
136 #define  DEBUG_CROSSINGS    0
137 #define  DEBUG_FREQUENCY    0
138 #endif  /* ~NO_CONSOLE_IO */
139 
140 
141 /*----------------------------------------------------------------------*
142  *                     Morphological operations                         *
143  *----------------------------------------------------------------------*/
144 /*!
145  * \brief   numaErode()
146  *
147  * \param[in]    nas
148  * \param[in]    size of sel; greater than 0, odd; origin implicitly in center
149  * \return  nad eroded, or NULL on error
150  *
151  * <pre>
152  * Notes:
153  *      (1) The structuring element (sel) is linear, all "hits"
154  *      (2) If size == 1, this returns a copy
155  *      (3) General comment.  The morphological operations are equivalent
156  *          to those that would be performed on a 1-dimensional fpix.
157  *          However, because we have not implemented morphological
158  *          operations on fpix, we do this here.  Because it is only
159  *          1 dimensional, there is no reason to use the more
160  *          complicated van Herk/Gil-Werman algorithm, and we do it
161  *          by brute force.
162  * </pre>
163  */
164 NUMA *
numaErode(NUMA * nas,l_int32 size)165 numaErode(NUMA    *nas,
166           l_int32  size)
167 {
168 l_int32     i, j, n, hsize, len;
169 l_float32   minval;
170 l_float32  *fa, *fas, *fad;
171 NUMA       *nad;
172 
173     PROCNAME("numaErode");
174 
175     if (!nas)
176         return (NUMA *)ERROR_PTR("nas not defined", procName, NULL);
177     if (size <= 0)
178         return (NUMA *)ERROR_PTR("size must be > 0", procName, NULL);
179     if ((size & 1) == 0 ) {
180         L_WARNING("sel size must be odd; increasing by 1\n", procName);
181         size++;
182     }
183 
184     if (size == 1)
185         return numaCopy(nas);
186 
187         /* Make a source fa (fas) that has an added (size / 2) boundary
188          * on left and right, contains a copy of nas in the interior region
189          * (between 'size' and 'size + n', and has large values
190          * inserted in the boundary (because it is an erosion). */
191     n = numaGetCount(nas);
192     hsize = size / 2;
193     len = n + 2 * hsize;
194     if ((fas = (l_float32 *)LEPT_CALLOC(len, sizeof(l_float32))) == NULL)
195         return (NUMA *)ERROR_PTR("fas not made", procName, NULL);
196     for (i = 0; i < hsize; i++)
197          fas[i] = 1.0e37;
198     for (i = hsize + n; i < len; i++)
199          fas[i] = 1.0e37;
200     fa = numaGetFArray(nas, L_NOCOPY);
201     for (i = 0; i < n; i++)
202          fas[hsize + i] = fa[i];
203 
204     nad = numaMakeConstant(0, n);
205     numaCopyParameters(nad, nas);
206     fad = numaGetFArray(nad, L_NOCOPY);
207     for (i = 0; i < n; i++) {
208         minval = 1.0e37;  /* start big */
209         for (j = 0; j < size; j++)
210             minval = L_MIN(minval, fas[i + j]);
211         fad[i] = minval;
212     }
213 
214     LEPT_FREE(fas);
215     return nad;
216 }
217 
218 
219 /*!
220  * \brief   numaDilate()
221  *
222  * \param[in]    nas
223  * \param[in]    size of sel; greater than 0, odd; origin implicitly in center
224  * \return  nad dilated, or NULL on error
225  *
226  * <pre>
227  * Notes:
228  *      (1) The structuring element (sel) is linear, all "hits"
229  *      (2) If size == 1, this returns a copy
230  * </pre>
231  */
232 NUMA *
numaDilate(NUMA * nas,l_int32 size)233 numaDilate(NUMA    *nas,
234            l_int32  size)
235 {
236 l_int32     i, j, n, hsize, len;
237 l_float32   maxval;
238 l_float32  *fa, *fas, *fad;
239 NUMA       *nad;
240 
241     PROCNAME("numaDilate");
242 
243     if (!nas)
244         return (NUMA *)ERROR_PTR("nas not defined", procName, NULL);
245     if (size <= 0)
246         return (NUMA *)ERROR_PTR("size must be > 0", procName, NULL);
247     if ((size & 1) == 0 ) {
248         L_WARNING("sel size must be odd; increasing by 1\n", procName);
249         size++;
250     }
251 
252     if (size == 1)
253         return numaCopy(nas);
254 
255         /* Make a source fa (fas) that has an added (size / 2) boundary
256          * on left and right, contains a copy of nas in the interior region
257          * (between 'size' and 'size + n', and has small values
258          * inserted in the boundary (because it is a dilation). */
259     n = numaGetCount(nas);
260     hsize = size / 2;
261     len = n + 2 * hsize;
262     if ((fas = (l_float32 *)LEPT_CALLOC(len, sizeof(l_float32))) == NULL)
263         return (NUMA *)ERROR_PTR("fas not made", procName, NULL);
264     for (i = 0; i < hsize; i++)
265          fas[i] = -1.0e37;
266     for (i = hsize + n; i < len; i++)
267          fas[i] = -1.0e37;
268     fa = numaGetFArray(nas, L_NOCOPY);
269     for (i = 0; i < n; i++)
270          fas[hsize + i] = fa[i];
271 
272     nad = numaMakeConstant(0, n);
273     numaCopyParameters(nad, nas);
274     fad = numaGetFArray(nad, L_NOCOPY);
275     for (i = 0; i < n; i++) {
276         maxval = -1.0e37;  /* start small */
277         for (j = 0; j < size; j++)
278             maxval = L_MAX(maxval, fas[i + j]);
279         fad[i] = maxval;
280     }
281 
282     LEPT_FREE(fas);
283     return nad;
284 }
285 
286 
287 /*!
288  * \brief   numaOpen()
289  *
290  * \param[in]    nas
291  * \param[in]    size of sel; greater than 0, odd; origin implicitly in center
292  * \return  nad opened, or NULL on error
293  *
294  * <pre>
295  * Notes:
296  *      (1) The structuring element (sel) is linear, all "hits"
297  *      (2) If size == 1, this returns a copy
298  * </pre>
299  */
300 NUMA *
numaOpen(NUMA * nas,l_int32 size)301 numaOpen(NUMA    *nas,
302          l_int32  size)
303 {
304 NUMA  *nat, *nad;
305 
306     PROCNAME("numaOpen");
307 
308     if (!nas)
309         return (NUMA *)ERROR_PTR("nas not defined", procName, NULL);
310     if (size <= 0)
311         return (NUMA *)ERROR_PTR("size must be > 0", procName, NULL);
312     if ((size & 1) == 0 ) {
313         L_WARNING("sel size must be odd; increasing by 1\n", procName);
314         size++;
315     }
316 
317     if (size == 1)
318         return numaCopy(nas);
319 
320     nat = numaErode(nas, size);
321     nad = numaDilate(nat, size);
322     numaDestroy(&nat);
323     return nad;
324 }
325 
326 
327 /*!
328  * \brief   numaClose()
329  *
330  * \param[in]    nas
331  * \param[in]    size of sel; greater than 0, odd; origin implicitly in center
332  * \return  nad opened, or NULL on error
333  *
334  * <pre>
335  * Notes:
336  *      (1) The structuring element (sel) is linear, all "hits"
337  *      (2) If size == 1, this returns a copy
338  *      (3) We add a border before doing this operation, for the same
339  *          reason that we add a border to a pix before doing a safe closing.
340  *          Without the border, a small component near the border gets
341  *          clipped at the border on dilation, and can be entirely removed
342  *          by the following erosion, violating the basic extensivity
343  *          property of closing.
344  * </pre>
345  */
346 NUMA *
numaClose(NUMA * nas,l_int32 size)347 numaClose(NUMA    *nas,
348           l_int32  size)
349 {
350 NUMA  *nab, *nat1, *nat2, *nad;
351 
352     PROCNAME("numaClose");
353 
354     if (!nas)
355         return (NUMA *)ERROR_PTR("nas not defined", procName, NULL);
356     if (size <= 0)
357         return (NUMA *)ERROR_PTR("size must be > 0", procName, NULL);
358     if ((size & 1) == 0 ) {
359         L_WARNING("sel size must be odd; increasing by 1\n", procName);
360         size++;
361     }
362 
363     if (size == 1)
364         return numaCopy(nas);
365 
366     nab = numaAddBorder(nas, size, size, 0);  /* to preserve extensivity */
367     nat1 = numaDilate(nab, size);
368     nat2 = numaErode(nat1, size);
369     nad = numaRemoveBorder(nat2, size, size);
370     numaDestroy(&nab);
371     numaDestroy(&nat1);
372     numaDestroy(&nat2);
373     return nad;
374 }
375 
376 
377 /*----------------------------------------------------------------------*
378  *                            Other transforms                          *
379  *----------------------------------------------------------------------*/
380 /*!
381  * \brief   numaTransform()
382  *
383  * \param[in]    nas
384  * \param[in]    shift add this to each number
385  * \param[in]    scale multiply each number by this
386  * \return  nad with all values shifted and scaled, or NULL on error
387  *
388  * <pre>
389  * Notes:
390  *      (1) Each number is shifted before scaling.
391  * </pre>
392  */
393 NUMA *
numaTransform(NUMA * nas,l_float32 shift,l_float32 scale)394 numaTransform(NUMA      *nas,
395               l_float32  shift,
396               l_float32  scale)
397 {
398 l_int32    i, n;
399 l_float32  val;
400 NUMA      *nad;
401 
402     PROCNAME("numaTransform");
403 
404     if (!nas)
405         return (NUMA *)ERROR_PTR("nas not defined", procName, NULL);
406     n = numaGetCount(nas);
407     if ((nad = numaCreate(n)) == NULL)
408         return (NUMA *)ERROR_PTR("nad not made", procName, NULL);
409     numaCopyParameters(nad, nas);
410     for (i = 0; i < n; i++) {
411         numaGetFValue(nas, i, &val);
412         val = scale * (val + shift);
413         numaAddNumber(nad, val);
414     }
415     return nad;
416 }
417 
418 
419 /*!
420  * \brief   numaSimpleStats()
421  *
422  * \param[in]    na input numa
423  * \param[in]    first first element to use
424  * \param[in]    last last element to use; 0 to go to the end
425  * \param[out]   pmean [optional] mean value
426  * \param[out]   pvar [optional] variance
427  * \param[out]   prvar [optional] rms deviation from the mean
428  * \return  0 if OK, 1 on error
429  */
430 l_int32
numaSimpleStats(NUMA * na,l_int32 first,l_int32 last,l_float32 * pmean,l_float32 * pvar,l_float32 * prvar)431 numaSimpleStats(NUMA       *na,
432                 l_int32     first,
433                 l_int32     last,
434                 l_float32  *pmean,
435                 l_float32  *pvar,
436                 l_float32  *prvar)
437 {
438 l_int32    i, n, ni;
439 l_float32  sum, sumsq, val, mean, var;
440 
441     PROCNAME("numaSimpleStats");
442 
443     if (pmean) *pmean = 0.0;
444     if (pvar) *pvar = 0.0;
445     if (prvar) *prvar = 0.0;
446     if (!pmean && !pvar && !prvar)
447         return ERROR_INT("nothing requested", procName, 1);
448     if (!na)
449         return ERROR_INT("na not defined", procName, 1);
450     if ((n = numaGetCount(na)) == 0)
451         return ERROR_INT("na is empty", procName, 1);
452     if (last == 0) last = n - 1;
453     last = L_MIN(last, n - 1);
454     if (first > last) {
455         L_ERROR("invalid: first(%d) > last(%d)\n", procName, first, last);
456         return 1;
457     }
458     ni = last - first + 1;
459     sum = sumsq = 0.0;
460     for (i = first; i <= last; i++) {
461         numaGetFValue(na, i, &val);
462         sum += val;
463         sumsq += val * val;
464     }
465 
466     mean = sum / ni;
467     if (pmean)
468         *pmean = mean;
469     if (pvar || prvar) {
470         var = sumsq / ni - mean * mean;
471         if (pvar) *pvar = var;
472         if (prvar) *prvar = sqrtf(var);
473     }
474 
475     return 0;
476 }
477 
478 
479 /*!
480  * \brief   numaWindowedStats()
481  *
482  * \param[in]    nas input numa
483  * \param[in]    wc half width of the window
484  * \param[out]   pnam [optional] mean value in window
485  * \param[out]   pnams [optional] mean square value in window
486  * \param[out]   pnav [optional] variance in window
487  * \param[out]   pnarv [optional] rms deviation from the mean
488  * \return  0 if OK, 1 on error
489  *
490  * <pre>
491  * Notes:
492  *      (1) This is a high-level convenience function for calculating
493  *          any or all of these derived arrays.
494  *      (2) These statistical measures over the values in the
495  *          rectangular window are:
496  *            ~ average value: <x>  (nam)
497  *            ~ average squared value: <x*x> (nams)
498  *            ~ variance: <(x - <x>)*(x - <x>)> = <x*x> - <x>*<x>  (nav)
499  *            ~ square-root of variance: (narv)
500  *          where the brackets < .. > indicate that the average value is
501  *          to be taken over the window.
502  *      (3) Note that the variance is just the mean square difference from
503  *          the mean value; and the square root of the variance is the
504  *          root mean square difference from the mean, sometimes also
505  *          called the 'standard deviation'.
506  *      (4) Internally, use mirrored borders to handle values near the
507  *          end of each array.
508  * </pre>
509  */
510 l_int32
numaWindowedStats(NUMA * nas,l_int32 wc,NUMA ** pnam,NUMA ** pnams,NUMA ** pnav,NUMA ** pnarv)511 numaWindowedStats(NUMA    *nas,
512                   l_int32  wc,
513                   NUMA   **pnam,
514                   NUMA   **pnams,
515                   NUMA   **pnav,
516                   NUMA   **pnarv)
517 {
518 NUMA  *nam, *nams;
519 
520     PROCNAME("numaWindowedStats");
521 
522     if (!nas)
523         return ERROR_INT("nas not defined", procName, 1);
524     if (2 * wc + 1 > numaGetCount(nas))
525         L_WARNING("filter wider than input array!\n", procName);
526 
527     if (!pnav && !pnarv) {
528         if (pnam) *pnam = numaWindowedMean(nas, wc);
529         if (pnams) *pnams = numaWindowedMeanSquare(nas, wc);
530         return 0;
531     }
532 
533     nam = numaWindowedMean(nas, wc);
534     nams = numaWindowedMeanSquare(nas, wc);
535     numaWindowedVariance(nam, nams, pnav, pnarv);
536     if (pnam)
537         *pnam = nam;
538     else
539         numaDestroy(&nam);
540     if (pnams)
541         *pnams = nams;
542     else
543         numaDestroy(&nams);
544     return 0;
545 }
546 
547 
548 /*!
549  * \brief   numaWindowedMean()
550  *
551  * \param[in]    nas
552  * \param[in]    wc half width of the convolution window
553  * \return  nad after low-pass filtering, or NULL on error
554  *
555  * <pre>
556  * Notes:
557  *      (1) This is a convolution.  The window has width = 2 * %wc + 1.
558  *      (2) We add a mirrored border of size %wc to each end of the array.
559  * </pre>
560  */
561 NUMA *
numaWindowedMean(NUMA * nas,l_int32 wc)562 numaWindowedMean(NUMA    *nas,
563                  l_int32  wc)
564 {
565 l_int32     i, n, n1, width;
566 l_float32   sum, norm;
567 l_float32  *fa1, *fad, *suma;
568 NUMA       *na1, *nad;
569 
570     PROCNAME("numaWindowedMean");
571 
572     if (!nas)
573         return (NUMA *)ERROR_PTR("nas not defined", procName, NULL);
574     n = numaGetCount(nas);
575     width = 2 * wc + 1;  /* filter width */
576     if (width > n)
577         L_WARNING("filter wider than input array!\n", procName);
578 
579     na1 = numaAddSpecifiedBorder(nas, wc, wc, L_MIRRORED_BORDER);
580     n1 = n + 2 * wc;
581     fa1 = numaGetFArray(na1, L_NOCOPY);
582     nad = numaMakeConstant(0, n);
583     fad = numaGetFArray(nad, L_NOCOPY);
584 
585         /* Make sum array; note the indexing */
586     if ((suma = (l_float32 *)LEPT_CALLOC(n1 + 1, sizeof(l_float32))) == NULL) {
587         numaDestroy(&na1);
588         numaDestroy(&nad);
589         return (NUMA *)ERROR_PTR("suma not made", procName, NULL);
590     }
591     sum = 0.0;
592     suma[0] = 0.0;
593     for (i = 0; i < n1; i++) {
594         sum += fa1[i];
595         suma[i + 1] = sum;
596     }
597 
598     norm = 1. / (2 * wc + 1);
599     for (i = 0; i < n; i++)
600         fad[i] = norm * (suma[width + i] - suma[i]);
601 
602     LEPT_FREE(suma);
603     numaDestroy(&na1);
604     return nad;
605 }
606 
607 
608 /*!
609  * \brief   numaWindowedMeanSquare()
610  *
611  * \param[in]    nas
612  * \param[in]    wc half width of the window
613  * \return  nad containing windowed mean square values, or NULL on error
614  *
615  * <pre>
616  * Notes:
617  *      (1) The window has width = 2 * %wc + 1.
618  *      (2) We add a mirrored border of size %wc to each end of the array.
619  * </pre>
620  */
621 NUMA *
numaWindowedMeanSquare(NUMA * nas,l_int32 wc)622 numaWindowedMeanSquare(NUMA    *nas,
623                        l_int32  wc)
624 {
625 l_int32     i, n, n1, width;
626 l_float32   sum, norm;
627 l_float32  *fa1, *fad, *suma;
628 NUMA       *na1, *nad;
629 
630     PROCNAME("numaWindowedMeanSquare");
631 
632     if (!nas)
633         return (NUMA *)ERROR_PTR("nas not defined", procName, NULL);
634     n = numaGetCount(nas);
635     width = 2 * wc + 1;  /* filter width */
636     if (width > n)
637         L_WARNING("filter wider than input array!\n", procName);
638 
639     na1 = numaAddSpecifiedBorder(nas, wc, wc, L_MIRRORED_BORDER);
640     n1 = n + 2 * wc;
641     fa1 = numaGetFArray(na1, L_NOCOPY);
642     nad = numaMakeConstant(0, n);
643     fad = numaGetFArray(nad, L_NOCOPY);
644 
645         /* Make sum array; note the indexing */
646     if ((suma = (l_float32 *)LEPT_CALLOC(n1 + 1, sizeof(l_float32))) == NULL) {
647         numaDestroy(&na1);
648         numaDestroy(&nad);
649         return (NUMA *)ERROR_PTR("suma not made", procName, NULL);
650     }
651     sum = 0.0;
652     suma[0] = 0.0;
653     for (i = 0; i < n1; i++) {
654         sum += fa1[i] * fa1[i];
655         suma[i + 1] = sum;
656     }
657 
658     norm = 1. / (2 * wc + 1);
659     for (i = 0; i < n; i++)
660         fad[i] = norm * (suma[width + i] - suma[i]);
661 
662     LEPT_FREE(suma);
663     numaDestroy(&na1);
664     return nad;
665 }
666 
667 
668 /*!
669  * \brief   numaWindowedVariance()
670  *
671  * \param[in]    nam windowed mean values
672  * \param[in]    nams windowed mean square values
673  * \param[out]   pnav [optional] numa of variance -- the ms deviation
674  *                     from the mean
675  * \param[out]   pnarv [optional] numa of rms deviation from the mean
676  * \return  0 if OK, 1 on error
677  *
678  * <pre>
679  * Notes:
680  *      (1) The numas of windowed mean and mean square are precomputed,
681  *          using numaWindowedMean() and numaWindowedMeanSquare().
682  *      (2) Either or both of the variance and square-root of variance
683  *          are returned, where the variance is the average over the
684  *          window of the mean square difference of the pixel value
685  *          from the mean:
686  *                <(x - <x>)*(x - <x>)> = <x*x> - <x>*<x>
687  * </pre>
688  */
689 l_int32
numaWindowedVariance(NUMA * nam,NUMA * nams,NUMA ** pnav,NUMA ** pnarv)690 numaWindowedVariance(NUMA   *nam,
691                      NUMA   *nams,
692                      NUMA  **pnav,
693                      NUMA  **pnarv)
694 {
695 l_int32     i, nm, nms;
696 l_float32   var;
697 l_float32  *fam, *fams, *fav, *farv;
698 NUMA       *nav, *narv;  /* variance and square root of variance */
699 
700     PROCNAME("numaWindowedVariance");
701 
702     if (pnav) *pnav = NULL;
703     if (pnarv) *pnarv = NULL;
704     if (!pnav && !pnarv)
705         return ERROR_INT("neither &nav nor &narv are defined", procName, 1);
706     if (!nam)
707         return ERROR_INT("nam not defined", procName, 1);
708     if (!nams)
709         return ERROR_INT("nams not defined", procName, 1);
710     nm = numaGetCount(nam);
711     nms = numaGetCount(nams);
712     if (nm != nms)
713         return ERROR_INT("sizes of nam and nams differ", procName, 1);
714 
715     if (pnav) {
716         nav = numaMakeConstant(0, nm);
717         *pnav = nav;
718         fav = numaGetFArray(nav, L_NOCOPY);
719     }
720     if (pnarv) {
721         narv = numaMakeConstant(0, nm);
722         *pnarv = narv;
723         farv = numaGetFArray(narv, L_NOCOPY);
724     }
725     fam = numaGetFArray(nam, L_NOCOPY);
726     fams = numaGetFArray(nams, L_NOCOPY);
727 
728     for (i = 0; i < nm; i++) {
729         var = fams[i] - fam[i] * fam[i];
730         if (pnav)
731             fav[i] = var;
732         if (pnarv)
733             farv[i] = sqrtf(var);
734     }
735 
736     return 0;
737 }
738 
739 
740 /*!
741  * \brief   numaWindowedMedian()
742  *
743  * \param[in]    nas
744  * \param[in]    halfwin half width of window over which the median is found
745  * \return  nad after windowed median filtering, or NULL on error
746  *
747  * <pre>
748  * Notes:
749  *      (1) The requested window has width = 2 * %halfwin + 1.
750  *      (2) If the input nas has less then 3 elements, return a copy.
751  *      (3) If the filter is too small (%halfwin <= 0), return a copy.
752  *      (4) If the filter is too large, it is reduced in size.
753  *      (5) We add a mirrored border of size %halfwin to each end of
754  *          the array to simplify the calculation by avoiding end-effects.
755  * </pre>
756  */
757 NUMA *
numaWindowedMedian(NUMA * nas,l_int32 halfwin)758 numaWindowedMedian(NUMA    *nas,
759                    l_int32  halfwin)
760 {
761 l_int32    i, n;
762 l_float32  medval;
763 NUMA      *na1, *na2, *nad;
764 
765     PROCNAME("numaWindowedMedian");
766 
767     if (!nas)
768         return (NUMA *)ERROR_PTR("nas not defined", procName, NULL);
769     if ((n = numaGetCount(nas)) < 3)
770         return numaCopy(nas);
771     if (halfwin <= 0) {
772         L_ERROR("filter too small; returning a copy\n", procName);
773         return numaCopy(nas);
774     }
775 
776     if (halfwin > (n - 1) / 2) {
777         halfwin = (n - 1) / 2;
778         L_INFO("reducing filter to halfwin = %d\n", procName, halfwin);
779     }
780 
781         /* Add a border to both ends */
782     na1 = numaAddSpecifiedBorder(nas, halfwin, halfwin, L_MIRRORED_BORDER);
783 
784         /* Get the median value at the center of each window, corresponding
785          * to locations in the input nas. */
786     nad = numaCreate(n);
787     for (i = 0; i < n; i++) {
788         na2 = numaClipToInterval(na1, i, i + 2 * halfwin);
789         numaGetMedian(na2, &medval);
790         numaAddNumber(nad, medval);
791         numaDestroy(&na2);
792     }
793 
794     numaDestroy(&na1);
795     return nad;
796 }
797 
798 
799 /*!
800  * \brief   numaConvertToInt()
801  *
802  * \param[in]    nas source numa
803  * \return  na with all values rounded to nearest integer, or
804  *              NULL on error
805  */
806 NUMA *
numaConvertToInt(NUMA * nas)807 numaConvertToInt(NUMA  *nas)
808 {
809 l_int32  i, n, ival;
810 NUMA    *nad;
811 
812     PROCNAME("numaConvertToInt");
813 
814     if (!nas)
815         return (NUMA *)ERROR_PTR("nas not defined", procName, NULL);
816 
817     n = numaGetCount(nas);
818     if ((nad = numaCreate(n)) == NULL)
819         return (NUMA *)ERROR_PTR("nad not made", procName, NULL);
820     numaCopyParameters(nad, nas);
821     for (i = 0; i < n; i++) {
822         numaGetIValue(nas, i, &ival);
823         numaAddNumber(nad, ival);
824     }
825     return nad;
826 }
827 
828 
829 /*----------------------------------------------------------------------*
830  *                 Histogram generation and statistics                  *
831  *----------------------------------------------------------------------*/
832 /*!
833  * \brief   numaMakeHistogram()
834  *
835  * \param[in]    na
836  * \param[in]    maxbins max number of histogram bins
837  * \param[out]   pbinsize  size of histogram bins
838  * \param[out]   pbinstart [optional] start val of minimum bin;
839  *                         input NULL to force start at 0
840  * \return  na consisiting of histogram of integerized values,
841  *              or NULL on error.
842  *
843  * <pre>
844  * Notes:
845  *      (1) This simple interface is designed for integer data.
846  *          The bins are of integer width and start on integer boundaries,
847  *          so the results on float data will not have high precision.
848  *      (2) Specify the max number of input bins.   Then %binsize,
849  *          the size of bins necessary to accommodate the input data,
850  *          is returned.  It is one of the sequence:
851  *                {1, 2, 5, 10, 20, 50, ...}.
852  *      (3) If &binstart is given, all values are accommodated,
853  *          and the min value of the starting bin is returned.
854  *          Otherwise, all negative values are discarded and
855  *          the histogram bins start at 0.
856  * </pre>
857  */
858 NUMA *
numaMakeHistogram(NUMA * na,l_int32 maxbins,l_int32 * pbinsize,l_int32 * pbinstart)859 numaMakeHistogram(NUMA     *na,
860                   l_int32   maxbins,
861                   l_int32  *pbinsize,
862                   l_int32  *pbinstart)
863 {
864 l_int32    i, n, ival, hval;
865 l_int32    iminval, imaxval, range, binsize, nbins, ibin;
866 l_float32  val, ratio;
867 NUMA      *nai, *nahist;
868 
869     PROCNAME("numaMakeHistogram");
870 
871     if (!na)
872         return (NUMA *)ERROR_PTR("na not defined", procName, NULL);
873     if (!pbinsize)
874         return (NUMA *)ERROR_PTR("&binsize not defined", procName, NULL);
875 
876         /* Determine input range */
877     numaGetMin(na, &val, NULL);
878     iminval = (l_int32)(val + 0.5);
879     numaGetMax(na, &val, NULL);
880     imaxval = (l_int32)(val + 0.5);
881     if (pbinstart == NULL) {  /* clip negative vals; start from 0 */
882         iminval = 0;
883         if (imaxval < 0)
884             return (NUMA *)ERROR_PTR("all values < 0", procName, NULL);
885     }
886 
887         /* Determine binsize */
888     range = imaxval - iminval + 1;
889     if (range > maxbins - 1) {
890         ratio = (l_float64)range / (l_float64)maxbins;
891         binsize = 0;
892         for (i = 0; i < NBinSizes; i++) {
893             if (ratio < BinSizeArray[i]) {
894                 binsize = BinSizeArray[i];
895                 break;
896             }
897         }
898         if (binsize == 0)
899             return (NUMA *)ERROR_PTR("numbers too large", procName, NULL);
900     } else {
901         binsize = 1;
902     }
903     *pbinsize = binsize;
904     nbins = 1 + range / binsize;  /* +1 seems to be sufficient */
905 
906         /* Redetermine iminval */
907     if (pbinstart && binsize > 1) {
908         if (iminval >= 0)
909             iminval = binsize * (iminval / binsize);
910         else
911             iminval = binsize * ((iminval - binsize + 1) / binsize);
912     }
913     if (pbinstart)
914         *pbinstart = iminval;
915 
916 #if  DEBUG_HISTO
917     fprintf(stderr, " imaxval = %d, range = %d, nbins = %d\n",
918             imaxval, range, nbins);
919 #endif  /* DEBUG_HISTO */
920 
921         /* Use integerized data for input */
922     if ((nai = numaConvertToInt(na)) == NULL)
923         return (NUMA *)ERROR_PTR("nai not made", procName, NULL);
924     n = numaGetCount(nai);
925 
926         /* Make histogram, converting value in input array
927          * into a bin number for this histogram array. */
928     if ((nahist = numaCreate(nbins)) == NULL) {
929         numaDestroy(&nai);
930         return (NUMA *)ERROR_PTR("nahist not made", procName, NULL);
931     }
932     numaSetCount(nahist, nbins);
933     numaSetParameters(nahist, iminval, binsize);
934     for (i = 0; i < n; i++) {
935         numaGetIValue(nai, i, &ival);
936         ibin = (ival - iminval) / binsize;
937         if (ibin >= 0 && ibin < nbins) {
938             numaGetIValue(nahist, ibin, &hval);
939             numaSetValue(nahist, ibin, hval + 1.0);
940         }
941     }
942 
943     numaDestroy(&nai);
944     return nahist;
945 }
946 
947 
948 /*!
949  * \brief   numaMakeHistogramAuto()
950  *
951  * \param[in]    na numa of floats; these may be integers
952  * \param[in]    maxbins max number of histogram bins; >= 1
953  * \return  na consisiting of histogram of quantized float values,
954  *              or NULL on error.
955  *
956  * <pre>
957  * Notes:
958  *      (1) This simple interface is designed for accurate binning
959  *          of both integer and float data.
960  *      (2) If the array data is integers, and the range of integers
961  *          is smaller than %maxbins, they are binned as they fall,
962  *          with binsize = 1.
963  *      (3) If the range of data, (maxval - minval), is larger than
964  *          %maxbins, or if the data is floats, they are binned into
965  *          exactly %maxbins bins.
966  *      (4) Unlike numaMakeHistogram(), these bins in general have
967  *          non-integer location and width, even for integer data.
968  * </pre>
969  */
970 NUMA *
numaMakeHistogramAuto(NUMA * na,l_int32 maxbins)971 numaMakeHistogramAuto(NUMA    *na,
972                       l_int32  maxbins)
973 {
974 l_int32    i, n, imin, imax, irange, ibin, ival, allints;
975 l_float32  minval, maxval, range, binsize, fval;
976 NUMA      *nah;
977 
978     PROCNAME("numaMakeHistogramAuto");
979 
980     if (!na)
981         return (NUMA *)ERROR_PTR("na not defined", procName, NULL);
982     maxbins = L_MAX(1, maxbins);
983 
984         /* Determine input range */
985     numaGetMin(na, &minval, NULL);
986     numaGetMax(na, &maxval, NULL);
987 
988         /* Determine if values are all integers */
989     n = numaGetCount(na);
990     numaHasOnlyIntegers(na, maxbins, &allints);
991 
992         /* Do simple integer binning if possible */
993     if (allints && (maxval - minval < maxbins)) {
994         imin = (l_int32)minval;
995         imax = (l_int32)maxval;
996         irange = imax - imin + 1;
997         nah = numaCreate(irange);
998         numaSetCount(nah, irange);  /* init */
999         numaSetParameters(nah, minval, 1.0);
1000         for (i = 0; i < n; i++) {
1001             numaGetIValue(na, i, &ival);
1002             ibin = ival - imin;
1003             numaGetIValue(nah, ibin, &ival);
1004             numaSetValue(nah, ibin, ival + 1.0);
1005         }
1006 
1007         return nah;
1008     }
1009 
1010         /* Do float binning, even if the data is integers. */
1011     range = maxval - minval;
1012     binsize = range / (l_float32)maxbins;
1013     if (range == 0.0) {
1014         nah = numaCreate(1);
1015         numaSetParameters(nah, minval, binsize);
1016         numaAddNumber(nah, n);
1017         return nah;
1018     }
1019     nah = numaCreate(maxbins);
1020     numaSetCount(nah, maxbins);
1021     numaSetParameters(nah, minval, binsize);
1022     for (i = 0; i < n; i++) {
1023         numaGetFValue(na, i, &fval);
1024         ibin = (l_int32)((fval - minval) / binsize);
1025         ibin = L_MIN(ibin, maxbins - 1);  /* "edge" case; stay in bounds */
1026         numaGetIValue(nah, ibin, &ival);
1027         numaSetValue(nah, ibin, ival + 1.0);
1028     }
1029 
1030     return nah;
1031 }
1032 
1033 
1034 /*!
1035  * \brief   numaMakeHistogramClipped()
1036  *
1037  * \param[in]    na
1038  * \param[in]    binsize typically 1.0
1039  * \param[in]    maxsize of histogram ordinate
1040  * \return  na histogram of bins of size %binsize, starting with
1041  *                  the na[0] (x = 0.0 and going up to a maximum of
1042  *                  x = %maxsize, by increments of %binsize), or NULL on error
1043  *
1044  * <pre>
1045  * Notes:
1046  *      (1) This simple function generates a histogram of values
1047  *          from na, discarding all values < 0.0 or greater than
1048  *          min(%maxsize, maxval), where maxval is the maximum value in na.
1049  *          The histogram data is put in bins of size delx = %binsize,
1050  *          starting at x = 0.0.  We use as many bins as are
1051  *          needed to hold the data.
1052  * </pre>
1053  */
1054 NUMA *
numaMakeHistogramClipped(NUMA * na,l_float32 binsize,l_float32 maxsize)1055 numaMakeHistogramClipped(NUMA      *na,
1056                          l_float32  binsize,
1057                          l_float32  maxsize)
1058 {
1059 l_int32    i, n, nbins, ival, ibin;
1060 l_float32  val, maxval;
1061 NUMA      *nad;
1062 
1063     PROCNAME("numaMakeHistogramClipped");
1064 
1065     if (!na)
1066         return (NUMA *)ERROR_PTR("na not defined", procName, NULL);
1067     if (binsize <= 0.0)
1068         return (NUMA *)ERROR_PTR("binsize must be > 0.0", procName, NULL);
1069     if (binsize > maxsize)
1070         binsize = maxsize;  /* just one bin */
1071 
1072     numaGetMax(na, &maxval, NULL);
1073     n = numaGetCount(na);
1074     maxsize = L_MIN(maxsize, maxval);
1075     nbins = (l_int32)(maxsize / binsize) + 1;
1076 
1077 /*    fprintf(stderr, "maxsize = %7.3f, nbins = %d\n", maxsize, nbins); */
1078 
1079     if ((nad = numaCreate(nbins)) == NULL)
1080         return (NUMA *)ERROR_PTR("nad not made", procName, NULL);
1081     numaSetParameters(nad, 0.0, binsize);
1082     numaSetCount(nad, nbins);  /* interpret zeroes in bins as data */
1083     for (i = 0; i < n; i++) {
1084         numaGetFValue(na, i, &val);
1085         ibin = (l_int32)(val / binsize);
1086         if (ibin >= 0 && ibin < nbins) {
1087             numaGetIValue(nad, ibin, &ival);
1088             numaSetValue(nad, ibin, ival + 1.0);
1089         }
1090     }
1091 
1092     return nad;
1093 }
1094 
1095 
1096 /*!
1097  * \brief   numaRebinHistogram()
1098  *
1099  * \param[in]    nas input histogram
1100  * \param[in]    newsize number of old bins contained in each new bin
1101  * \return  nad more coarsely re-binned histogram, or NULL on error
1102  */
1103 NUMA *
numaRebinHistogram(NUMA * nas,l_int32 newsize)1104 numaRebinHistogram(NUMA    *nas,
1105                    l_int32  newsize)
1106 {
1107 l_int32    i, j, ns, nd, index, count, val;
1108 l_float32  start, oldsize;
1109 NUMA      *nad;
1110 
1111     PROCNAME("numaRebinHistogram");
1112 
1113     if (!nas)
1114         return (NUMA *)ERROR_PTR("nas not defined", procName, NULL);
1115     if (newsize <= 1)
1116         return (NUMA *)ERROR_PTR("newsize must be > 1", procName, NULL);
1117     if ((ns = numaGetCount(nas)) == 0)
1118         return (NUMA *)ERROR_PTR("no bins in nas", procName, NULL);
1119 
1120     nd = (ns + newsize - 1) / newsize;
1121     if ((nad = numaCreate(nd)) == NULL)
1122         return (NUMA *)ERROR_PTR("nad not made", procName, NULL);
1123     numaGetParameters(nad, &start, &oldsize);
1124     numaSetParameters(nad, start, oldsize * newsize);
1125 
1126     for (i = 0; i < nd; i++) {  /* new bins */
1127         count = 0;
1128         index = i * newsize;
1129         for (j = 0; j < newsize; j++) {
1130             if (index < ns) {
1131                 numaGetIValue(nas, index, &val);
1132                 count += val;
1133                 index++;
1134             }
1135         }
1136         numaAddNumber(nad, count);
1137     }
1138 
1139     return nad;
1140 }
1141 
1142 
1143 /*!
1144  * \brief   numaNormalizeHistogram()
1145  *
1146  * \param[in]    nas input histogram
1147  * \param[in]    tsum target sum of all numbers in dest histogram;
1148  *                    e.g., use %tsum= 1.0 if this represents a
1149  *                    probability distribution
1150  * \return  nad normalized histogram, or NULL on error
1151  */
1152 NUMA *
numaNormalizeHistogram(NUMA * nas,l_float32 tsum)1153 numaNormalizeHistogram(NUMA      *nas,
1154                        l_float32  tsum)
1155 {
1156 l_int32    i, ns;
1157 l_float32  sum, factor, fval;
1158 NUMA      *nad;
1159 
1160     PROCNAME("numaNormalizeHistogram");
1161 
1162     if (!nas)
1163         return (NUMA *)ERROR_PTR("nas not defined", procName, NULL);
1164     if (tsum <= 0.0)
1165         return (NUMA *)ERROR_PTR("tsum must be > 0.0", procName, NULL);
1166     if ((ns = numaGetCount(nas)) == 0)
1167         return (NUMA *)ERROR_PTR("no bins in nas", procName, NULL);
1168 
1169     numaGetSum(nas, &sum);
1170     factor = tsum / sum;
1171 
1172     if ((nad = numaCreate(ns)) == NULL)
1173         return (NUMA *)ERROR_PTR("nad not made", procName, NULL);
1174     numaCopyParameters(nad, nas);
1175 
1176     for (i = 0; i < ns; i++) {
1177         numaGetFValue(nas, i, &fval);
1178         fval *= factor;
1179         numaAddNumber(nad, fval);
1180     }
1181 
1182     return nad;
1183 }
1184 
1185 
1186 /*!
1187  * \brief   numaGetStatsUsingHistogram()
1188  *
1189  * \param[in]    na an arbitrary set of numbers; not ordered and not
1190  *                  a histogram
1191  * \param[in]    maxbins the maximum number of bins to be allowed in
1192  *                       the histogram; use an integer larger than the
1193  *                       largest number in %na for consecutive integer bins
1194  * \param[out]   pmin [optional] min value of set
1195  * \param[out]   pmax [optional] max value of set
1196  * \param[out]   pmean [optional] mean value of set
1197  * \param[out]   pvariance [optional] variance
1198  * \param[out]   pmedian [optional] median value of set
1199  * \param[in]    rank in [0.0 ... 1.0]; median has a rank 0.5; ignored
1200  *                    if &rval == NULL
1201  * \param[out]   prval [optional] value in na corresponding to %rank
1202  * \param[out]   phisto [optional] Numa histogram; use NULL to prevent
1203  * \return  0 if OK, 1 on error
1204  *
1205  * <pre>
1206  * Notes:
1207  *      (1) This is a simple interface for gathering statistics
1208  *          from a numa, where a histogram is used 'under the covers'
1209  *          to avoid sorting if a rank value is requested.  In that case,
1210  *          by using a histogram we are trading speed for accuracy, because
1211  *          the values in %na are quantized to the center of a set of bins.
1212  *      (2) If the median, other rank value, or histogram are not requested,
1213  *          the calculation is all performed on the input Numa.
1214  *      (3) The variance is the average of the square of the
1215  *          difference from the mean.  The median is the value in na
1216  *          with rank 0.5.
1217  *      (4) There are two situations where this gives rank results with
1218  *          accuracy comparable to computing stastics directly on the input
1219  *          data, without binning into a histogram:
1220  *           (a) the data is integers and the range of data is less than
1221  *               %maxbins, and
1222  *           (b) the data is floats and the range is small compared to
1223  *               %maxbins, so that the binsize is much less than 1.
1224  *      (5) If a histogram is used and the numbers in the Numa extend
1225  *          over a large range, you can limit the required storage by
1226  *          specifying the maximum number of bins in the histogram.
1227  *          Use %maxbins == 0 to force the bin size to be 1.
1228  *      (6) This optionally returns the median and one arbitrary rank value.
1229  *          If you need several rank values, return the histogram and use
1230  *               numaHistogramGetValFromRank(nah, rank, &rval)
1231  *          multiple times.
1232  * </pre>
1233  */
1234 l_int32
numaGetStatsUsingHistogram(NUMA * na,l_int32 maxbins,l_float32 * pmin,l_float32 * pmax,l_float32 * pmean,l_float32 * pvariance,l_float32 * pmedian,l_float32 rank,l_float32 * prval,NUMA ** phisto)1235 numaGetStatsUsingHistogram(NUMA       *na,
1236                            l_int32     maxbins,
1237                            l_float32  *pmin,
1238                            l_float32  *pmax,
1239                            l_float32  *pmean,
1240                            l_float32  *pvariance,
1241                            l_float32  *pmedian,
1242                            l_float32   rank,
1243                            l_float32  *prval,
1244                            NUMA      **phisto)
1245 {
1246 l_int32    i, n;
1247 l_float32  minval, maxval, fval, mean, sum;
1248 NUMA      *nah;
1249 
1250     PROCNAME("numaGetStatsUsingHistogram");
1251 
1252     if (pmin) *pmin = 0.0;
1253     if (pmax) *pmax = 0.0;
1254     if (pmean) *pmean = 0.0;
1255     if (pvariance) *pvariance = 0.0;
1256     if (pmedian) *pmedian = 0.0;
1257     if (prval) *prval = 0.0;
1258     if (phisto) *phisto = NULL;
1259     if (!na)
1260         return ERROR_INT("na not defined", procName, 1);
1261     if ((n = numaGetCount(na)) == 0)
1262         return ERROR_INT("numa is empty", procName, 1);
1263 
1264     numaGetMin(na, &minval, NULL);
1265     numaGetMax(na, &maxval, NULL);
1266     if (pmin) *pmin = minval;
1267     if (pmax) *pmax = maxval;
1268     if (pmean || pvariance) {
1269         sum = 0.0;
1270         for (i = 0; i < n; i++) {
1271             numaGetFValue(na, i, &fval);
1272             sum += fval;
1273         }
1274         mean = sum / (l_float32)n;
1275         if (pmean) *pmean = mean;
1276     }
1277     if (pvariance) {
1278         sum = 0.0;
1279         for (i = 0; i < n; i++) {
1280             numaGetFValue(na, i, &fval);
1281             sum += fval * fval;
1282         }
1283         *pvariance = sum / (l_float32)n - mean * mean;
1284     }
1285 
1286     if (!pmedian && !prval && !phisto)
1287         return 0;
1288 
1289     nah = numaMakeHistogramAuto(na, maxbins);
1290     if (pmedian)
1291         numaHistogramGetValFromRank(nah, 0.5, pmedian);
1292     if (prval)
1293         numaHistogramGetValFromRank(nah, rank, prval);
1294     if (phisto)
1295         *phisto = nah;
1296     else
1297         numaDestroy(&nah);
1298     return 0;
1299 }
1300 
1301 
1302 /*!
1303  * \brief   numaGetHistogramStats()
1304  *
1305  * \param[in]    nahisto histogram: y(x(i)), i = 0 ... nbins - 1
1306  * \param[in]    startx x value of first bin: x(0)
1307  * \param[in]    deltax x increment between bins; the bin size; x(1) - x(0)
1308  * \param[out]   pxmean [optional] mean value of histogram
1309  * \param[out]   pxmedian [optional] median value of histogram
1310  * \param[out]   pxmode [optional] mode value of histogram:
1311  *                      xmode = x(imode), where y(xmode) >= y(x(i)) for
1312  *                      all i != imode
1313  * \param[out]   pxvariance [optional] variance of x
1314  * \return  0 if OK, 1 on error
1315  *
1316  * <pre>
1317  * Notes:
1318  *      (1) If the histogram represents the relation y(x), the
1319  *          computed values that are returned are the x values.
1320  *          These are NOT the bucket indices i; they are related to the
1321  *          bucket indices by
1322  *                x(i) = startx + i * deltax
1323  * </pre>
1324  */
1325 l_int32
numaGetHistogramStats(NUMA * nahisto,l_float32 startx,l_float32 deltax,l_float32 * pxmean,l_float32 * pxmedian,l_float32 * pxmode,l_float32 * pxvariance)1326 numaGetHistogramStats(NUMA       *nahisto,
1327                       l_float32   startx,
1328                       l_float32   deltax,
1329                       l_float32  *pxmean,
1330                       l_float32  *pxmedian,
1331                       l_float32  *pxmode,
1332                       l_float32  *pxvariance)
1333 {
1334     PROCNAME("numaGetHistogramStats");
1335 
1336     if (pxmean) *pxmean = 0.0;
1337     if (pxmedian) *pxmedian = 0.0;
1338     if (pxmode) *pxmode = 0.0;
1339     if (pxvariance) *pxvariance = 0.0;
1340     if (!nahisto)
1341         return ERROR_INT("nahisto not defined", procName, 1);
1342 
1343     return numaGetHistogramStatsOnInterval(nahisto, startx, deltax, 0, 0,
1344                                            pxmean, pxmedian, pxmode,
1345                                            pxvariance);
1346 }
1347 
1348 
1349 /*!
1350  * \brief   numaGetHistogramStatsOnInterval()
1351  *
1352  * \param[in]    nahisto histogram: y(x(i)), i = 0 ... nbins - 1
1353  * \param[in]    startx x value of first bin: x(0)
1354  * \param[in]    deltax x increment between bins; the bin size; x(1) - x(0)
1355  * \param[in]    ifirst first bin to use for collecting stats
1356  * \param[in]    ilast last bin for collecting stats; use 0 to go to the end
1357  * \param[out]   pxmean [optional] mean value of histogram
1358  * \param[out]   pxmedian [optional] median value of histogram
1359  * \param[out]   pxmode [optional] mode value of histogram:
1360  *                      xmode = x(imode), where y(xmode) >= y(x(i)) for
1361  *                      all i != imode
1362  * \param[out]   pxvariance [optional] variance of x
1363  * \return  0 if OK, 1 on error
1364  *
1365  * <pre>
1366  * Notes:
1367  *      (1) If the histogram represents the relation y(x), the
1368  *          computed values that are returned are the x values.
1369  *          These are NOT the bucket indices i; they are related to the
1370  *          bucket indices by
1371  *                x(i) = startx + i * deltax
1372  * </pre>
1373  */
1374 l_int32
numaGetHistogramStatsOnInterval(NUMA * nahisto,l_float32 startx,l_float32 deltax,l_int32 ifirst,l_int32 ilast,l_float32 * pxmean,l_float32 * pxmedian,l_float32 * pxmode,l_float32 * pxvariance)1375 numaGetHistogramStatsOnInterval(NUMA       *nahisto,
1376                                 l_float32   startx,
1377                                 l_float32   deltax,
1378                                 l_int32     ifirst,
1379                                 l_int32     ilast,
1380                                 l_float32  *pxmean,
1381                                 l_float32  *pxmedian,
1382                                 l_float32  *pxmode,
1383                                 l_float32  *pxvariance)
1384 {
1385 l_int32    i, n, imax;
1386 l_float32  sum, sumval, halfsum, moment, var, x, y, ymax;
1387 
1388     PROCNAME("numaGetHistogramStatsOnInterval");
1389 
1390     if (pxmean) *pxmean = 0.0;
1391     if (pxmedian) *pxmedian = 0.0;
1392     if (pxmode) *pxmode = 0.0;
1393     if (pxvariance) *pxvariance = 0.0;
1394     if (!nahisto)
1395         return ERROR_INT("nahisto not defined", procName, 1);
1396     if (!pxmean && !pxmedian && !pxmode && !pxvariance)
1397         return ERROR_INT("nothing to compute", procName, 1);
1398 
1399     n = numaGetCount(nahisto);
1400     if (ilast <= 0) ilast = n - 1;
1401     if (ifirst < 0) ifirst = 0;
1402     if (ifirst > ilast || ifirst > n - 1)
1403         return ERROR_INT("ifirst is too large", procName, 1);
1404     for (sum = 0.0, moment = 0.0, var = 0.0, i = ifirst; i <= ilast ; i++) {
1405         x = startx + i * deltax;
1406         numaGetFValue(nahisto, i, &y);
1407         sum += y;
1408         moment += x * y;
1409         var += x * x * y;
1410     }
1411     if (sum == 0.0) {
1412         L_INFO("sum is 0\n", procName);
1413         return 0;
1414     }
1415 
1416     if (pxmean)
1417         *pxmean = moment / sum;
1418     if (pxvariance)
1419         *pxvariance = var / sum - moment * moment / (sum * sum);
1420 
1421     if (pxmedian) {
1422         halfsum = sum / 2.0;
1423         for (sumval = 0.0, i = ifirst; i <= ilast; i++) {
1424             numaGetFValue(nahisto, i, &y);
1425             sumval += y;
1426             if (sumval >= halfsum) {
1427                 *pxmedian = startx + i * deltax;
1428                 break;
1429             }
1430         }
1431     }
1432 
1433     if (pxmode) {
1434         imax = -1;
1435         ymax = -1.0e10;
1436         for (i = ifirst; i <= ilast; i++) {
1437             numaGetFValue(nahisto, i, &y);
1438             if (y > ymax) {
1439                 ymax = y;
1440                 imax = i;
1441             }
1442         }
1443         *pxmode = startx + imax * deltax;
1444     }
1445 
1446     return 0;
1447 }
1448 
1449 
1450 /*!
1451  * \brief   numaMakeRankFromHistogram()
1452  *
1453  * \param[in]    startx xval corresponding to first element in nay
1454  * \param[in]    deltax x increment between array elements in nay
1455  * \param[in]    nasy input histogram, assumed equally spaced
1456  * \param[in]    npts number of points to evaluate rank function
1457  * \param[out]   pnax [optional] array of x values in range
1458  * \param[out]   pnay rank array of specified npts
1459  * \return  0 if OK, 1 on error
1460  */
1461 l_int32
numaMakeRankFromHistogram(l_float32 startx,l_float32 deltax,NUMA * nasy,l_int32 npts,NUMA ** pnax,NUMA ** pnay)1462 numaMakeRankFromHistogram(l_float32  startx,
1463                           l_float32  deltax,
1464                           NUMA      *nasy,
1465                           l_int32    npts,
1466                           NUMA     **pnax,
1467                           NUMA     **pnay)
1468 {
1469 l_int32    i, n;
1470 l_float32  sum, fval;
1471 NUMA      *nan, *nar;
1472 
1473     PROCNAME("numaMakeRankFromHistogram");
1474 
1475     if (pnax) *pnax = NULL;
1476     if (!pnay)
1477         return ERROR_INT("&nay not defined", procName, 1);
1478     *pnay = NULL;
1479     if (!nasy)
1480         return ERROR_INT("nasy not defined", procName, 1);
1481     if ((n = numaGetCount(nasy)) == 0)
1482         return ERROR_INT("no bins in nas", procName, 1);
1483 
1484         /* Normalize and generate the rank array corresponding to
1485          * the binned histogram. */
1486     nan = numaNormalizeHistogram(nasy, 1.0);
1487     nar = numaCreate(n + 1);  /* rank numa corresponding to nan */
1488     sum = 0.0;
1489     numaAddNumber(nar, sum);  /* first element is 0.0 */
1490     for (i = 0; i < n; i++) {
1491         numaGetFValue(nan, i, &fval);
1492         sum += fval;
1493         numaAddNumber(nar, sum);
1494     }
1495 
1496         /* Compute rank array on full range with specified
1497          * number of points and correspondence to x-values. */
1498     numaInterpolateEqxInterval(startx, deltax, nar, L_LINEAR_INTERP,
1499                                startx, startx + n * deltax, npts,
1500                                pnax, pnay);
1501     numaDestroy(&nan);
1502     numaDestroy(&nar);
1503     return 0;
1504 }
1505 
1506 
1507 /*!
1508  * \brief   numaHistogramGetRankFromVal()
1509  *
1510  * \param[in]    na histogram
1511  * \param[in]    rval value of input sample for which we want the rank
1512  * \param[out]   prank fraction of total samples below rval
1513  * \return  0 if OK, 1 on error
1514  *
1515  * <pre>
1516  * Notes:
1517  *      (1) If we think of the histogram as a function y(x), normalized
1518  *          to 1, for a given input value of x, this computes the
1519  *          rank of x, which is the integral of y(x) from the start
1520  *          value of x to the input value.
1521  *      (2) This function only makes sense when applied to a Numa that
1522  *          is a histogram.  The values in the histogram can be ints and
1523  *          floats, and are computed as floats.  The rank is returned
1524  *          as a float between 0.0 and 1.0.
1525  *      (3) The numa parameters startx and binsize are used to
1526  *          compute x from the Numa index i.
1527  * </pre>
1528  */
1529 l_int32
numaHistogramGetRankFromVal(NUMA * na,l_float32 rval,l_float32 * prank)1530 numaHistogramGetRankFromVal(NUMA       *na,
1531                             l_float32   rval,
1532                             l_float32  *prank)
1533 {
1534 l_int32    i, ibinval, n;
1535 l_float32  startval, binsize, binval, maxval, fractval, total, sum, val;
1536 
1537     PROCNAME("numaHistogramGetRankFromVal");
1538 
1539     if (!prank)
1540         return ERROR_INT("prank not defined", procName, 1);
1541     *prank = 0.0;
1542     if (!na)
1543         return ERROR_INT("na not defined", procName, 1);
1544     numaGetParameters(na, &startval, &binsize);
1545     n = numaGetCount(na);
1546     if (rval < startval)
1547         return 0;
1548     maxval = startval + n * binsize;
1549     if (rval > maxval) {
1550         *prank = 1.0;
1551         return 0;
1552     }
1553 
1554     binval = (rval - startval) / binsize;
1555     ibinval = (l_int32)binval;
1556     if (ibinval >= n) {
1557         *prank = 1.0;
1558         return 0;
1559     }
1560     fractval = binval - (l_float32)ibinval;
1561 
1562     sum = 0.0;
1563     for (i = 0; i < ibinval; i++) {
1564         numaGetFValue(na, i, &val);
1565         sum += val;
1566     }
1567     numaGetFValue(na, ibinval, &val);
1568     sum += fractval * val;
1569     numaGetSum(na, &total);
1570     *prank = sum / total;
1571 
1572 /*    fprintf(stderr, "binval = %7.3f, rank = %7.3f\n", binval, *prank); */
1573 
1574     return 0;
1575 }
1576 
1577 
1578 /*!
1579  * \brief   numaHistogramGetValFromRank()
1580  *
1581  * \param[in]    na histogram
1582  * \param[in]    rank fraction of total samples
1583  * \param[out]   prval approx. to the bin value
1584  * \return  0 if OK, 1 on error
1585  *
1586  * <pre>
1587  * Notes:
1588  *      (1) If we think of the histogram as a function y(x), this returns
1589  *          the value x such that the integral of y(x) from the start
1590  *          value to x gives the fraction 'rank' of the integral
1591  *          of y(x) over all bins.
1592  *      (2) This function only makes sense when applied to a Numa that
1593  *          is a histogram.  The values in the histogram can be ints and
1594  *          floats, and are computed as floats.  The val is returned
1595  *          as a float, even though the buckets are of integer width.
1596  *      (3) The numa parameters startx and binsize are used to
1597  *          compute x from the Numa index i.
1598  * </pre>
1599  */
1600 l_int32
numaHistogramGetValFromRank(NUMA * na,l_float32 rank,l_float32 * prval)1601 numaHistogramGetValFromRank(NUMA       *na,
1602                             l_float32   rank,
1603                             l_float32  *prval)
1604 {
1605 l_int32    i, n;
1606 l_float32  startval, binsize, rankcount, total, sum, fract, val;
1607 
1608     PROCNAME("numaHistogramGetValFromRank");
1609 
1610     if (!prval)
1611         return ERROR_INT("prval not defined", procName, 1);
1612     *prval = 0.0;
1613     if (!na)
1614         return ERROR_INT("na not defined", procName, 1);
1615     if (rank < 0.0) {
1616         L_WARNING("rank < 0; setting to 0.0\n", procName);
1617         rank = 0.0;
1618     }
1619     if (rank > 1.0) {
1620         L_WARNING("rank > 1.0; setting to 1.0\n", procName);
1621         rank = 1.0;
1622     }
1623 
1624     n = numaGetCount(na);
1625     numaGetParameters(na, &startval, &binsize);
1626     numaGetSum(na, &total);
1627     rankcount = rank * total;  /* count that corresponds to rank */
1628     sum = 0.0;
1629     for (i = 0; i < n; i++) {
1630         numaGetFValue(na, i, &val);
1631         if (sum + val >= rankcount)
1632             break;
1633         sum += val;
1634     }
1635     if (val <= 0.0)  /* can == 0 if rank == 0.0 */
1636         fract = 0.0;
1637     else  /* sum + fract * val = rankcount */
1638         fract = (rankcount - sum) / val;
1639 
1640     /* The use of the fraction of a bin allows a simple calculation
1641      * for the histogram value at the given rank. */
1642     *prval = startval + binsize * ((l_float32)i + fract);
1643 
1644 /*    fprintf(stderr, "rank = %7.3f, val = %7.3f\n", rank, *prval); */
1645 
1646     return 0;
1647 }
1648 
1649 
1650 /*!
1651  * \brief   numaDiscretizeRankAndIntensity()
1652  *
1653  * \param[in]    na normalized histogram of probability density vs intensity
1654  * \param[in]    nbins number of bins at which the rank is divided
1655  * \param[out]   pnarbin [optional] rank bin value vs intensity
1656  * \param[out]   pnam [optional] median intensity in a bin vs
1657  *                     rank bin value, with %nbins of discretized rank values
1658  * \param[out]   pnar [optional] rank vs intensity; this is
1659  *                     a cumulative norm histogram
1660  * \param[out]   pnabb [optional] intensity at the right bin boundary
1661  *                      vs rank bin
1662  * \return  0 if OK, 1 on error
1663  *
1664  * <pre>
1665  * Notes:
1666  *      (1) We are inverting the rank(intensity) function to get
1667  *          the intensity(rank) function at %nbins equally spaced
1668  *          values of rank between 0.0 and 1.0.  We save integer values
1669  *          for the intensity.
1670  *      (2) We are using the word "intensity" to describe the type of
1671  *          array values, but any array of non-negative numbers will work.
1672  *      (3) The output arrays give the following mappings, where the
1673  *          input is a normalized histogram of array values:
1674  *             array values     -->  rank bin number  (narbin)
1675  *             rank bin number  -->  median array value in bin (nam)
1676  *             array values     -->  cumulative norm = rank  (nar)
1677  *             rank bin number  -->  array value at right bin edge (nabb)
1678  * </pre>
1679  */
1680 l_int32
numaDiscretizeRankAndIntensity(NUMA * na,l_int32 nbins,NUMA ** pnarbin,NUMA ** pnam,NUMA ** pnar,NUMA ** pnabb)1681 numaDiscretizeRankAndIntensity(NUMA    *na,
1682                                l_int32  nbins,
1683                                NUMA   **pnarbin,
1684                                NUMA   **pnam,
1685                                NUMA   **pnar,
1686                                NUMA   **pnabb)
1687 {
1688 NUMA      *nar;  /* rank value as function of intensity */
1689 NUMA      *nam;  /* median intensity in the rank bins */
1690 NUMA      *nabb;  /* rank bin right boundaries (in intensity) */
1691 NUMA      *narbin;  /* binned rank value as a function of intensity */
1692 l_int32    i, j, npts, start, midfound, mcount, rightedge;
1693 l_float32  sum, midrank, endrank, val;
1694 
1695     PROCNAME("numaDiscretizeRankAndIntensity");
1696 
1697     if (pnarbin) *pnarbin = NULL;
1698     if (pnam) *pnam = NULL;
1699     if (pnar) *pnar = NULL;
1700     if (pnabb) *pnabb = NULL;
1701     if (!pnarbin && !pnam && !pnar && !pnabb)
1702         return ERROR_INT("no output requested", procName, 1);
1703     if (!na)
1704         return ERROR_INT("na not defined", procName, 1);
1705     if (nbins < 2)
1706         return ERROR_INT("nbins must be > 1", procName, 1);
1707 
1708         /* Get cumulative normalized histogram (rank vs intensity value).
1709          * For a normalized histogram from an 8 bpp grayscale image
1710          * as input, we have 256 bins and 257 points in the
1711          * cumulative (rank) histogram. */
1712     npts = numaGetCount(na);
1713     if ((nar = numaCreate(npts + 1)) == NULL)
1714         return ERROR_INT("nar not made", procName, 1);
1715     sum = 0.0;
1716     numaAddNumber(nar, sum);  /* left side of first bin */
1717     for (i = 0; i < npts; i++) {
1718         numaGetFValue(na, i, &val);
1719         sum += val;
1720         numaAddNumber(nar, sum);
1721     }
1722 
1723     nam = numaCreate(nbins);
1724     narbin = numaCreate(npts);
1725     nabb = numaCreate(nbins);
1726     if (!nam || !narbin || !nabb) {
1727         numaDestroy(&nar);
1728         numaDestroy(&nam);
1729         numaDestroy(&narbin);
1730         numaDestroy(&nabb);
1731         return ERROR_INT("numa not made", procName, 1);
1732     }
1733 
1734         /* We find the intensity value at the right edge of each of
1735          * the rank bins.  We also find the median intensity in the bin,
1736          * where approximately half the samples are lower and half are
1737          * higher.  This can be considered as a simple approximation
1738          * for the average intensity in the bin. */
1739     start = 0;  /* index in nar */
1740     mcount = 0;  /* count of median values in rank bins; not to exceed nbins */
1741     for (i = 0; i < nbins; i++) {
1742         midrank = (l_float32)(i + 0.5) / (l_float32)(nbins);
1743         endrank = (l_float32)(i + 1.0) / (l_float32)(nbins);
1744         endrank = L_MAX(0.0, L_MIN(endrank - 0.001, 1.0));
1745         midfound = FALSE;
1746         for (j = start; j < npts; j++) {  /* scan up for each bin value */
1747             numaGetFValue(nar, j, &val);
1748                 /* Use (j == npts - 1) tests in case all weight is at top end */
1749             if ((!midfound && val >= midrank) ||
1750                 (mcount < nbins && j == npts - 1)) {
1751                 midfound = TRUE;
1752                 numaAddNumber(nam, j);
1753                 mcount++;
1754             }
1755             if ((val >= endrank) || (j == npts - 1)) {
1756                 numaAddNumber(nabb, j);
1757                 if (val == endrank)
1758                     start = j;
1759                 else
1760                     start = j - 1;
1761                 break;
1762             }
1763         }
1764     }
1765     numaSetValue(nabb, nbins - 1, npts - 1);  /* extend to max */
1766 
1767         /* Error checking: did we get data in all bins? */
1768     if (mcount != nbins)
1769         L_WARNING("found data for %d bins; should be %d\n",
1770                   procName, mcount, nbins);
1771 
1772         /* Generate LUT that maps from intensity to bin number */
1773     start = 0;
1774     for (i = 0; i < nbins; i++) {
1775         numaGetIValue(nabb, i, &rightedge);
1776         for (j = start; j < npts; j++) {
1777             if (j <= rightedge)
1778                 numaAddNumber(narbin, i);
1779             if (j > rightedge) {
1780                 start = j;
1781                 break;
1782             }
1783             if (j == npts - 1) {  /* we're done */
1784                 start = j + 1;
1785                 break;
1786             }
1787         }
1788     }
1789 
1790     if (pnarbin)
1791         *pnarbin = narbin;
1792     else
1793         numaDestroy(&narbin);
1794     if (pnam)
1795         *pnam = nam;
1796     else
1797         numaDestroy(&nam);
1798     if (pnar)
1799         *pnar = nar;
1800     else
1801         numaDestroy(&nar);
1802     if (pnabb)
1803         *pnabb = nabb;
1804     else
1805         numaDestroy(&nabb);
1806     return 0;
1807 }
1808 
1809 
1810 /*!
1811  * \brief   numaGetRankBinValues()
1812  *
1813  * \param[in]    na just an array of values
1814  * \param[in]    nbins number of bins at which the rank is divided
1815  * \param[out]   pnarbin [optional] rank bin value vs array value
1816  * \param[out]   pnam [optional] median intensity in a bin vs
1817  *                     rank bin value, with %nbins of discretized rank values
1818  * \return  0 if OK, 1 on error
1819  *
1820  * <pre>
1821  * Notes:
1822  *      (1) Simple interface for getting a binned rank representation
1823  *          of an input array of values.  This returns two mappings:
1824  *             array value     -->  rank bin number  (narbin)
1825  *             rank bin number -->  median array value in each rank bin (nam)
1826  * </pre>
1827  */
1828 l_int32
numaGetRankBinValues(NUMA * na,l_int32 nbins,NUMA ** pnarbin,NUMA ** pnam)1829 numaGetRankBinValues(NUMA    *na,
1830                      l_int32  nbins,
1831                      NUMA   **pnarbin,
1832                      NUMA   **pnam)
1833 {
1834 NUMA      *nah, *nan;  /* histo and normalized histo */
1835 l_int32    maxbins, discardval;
1836 l_float32  maxval, delx;
1837 
1838     PROCNAME("numaGetRankBinValues");
1839 
1840     if (pnarbin) *pnarbin = NULL;
1841     if (pnam) *pnam = NULL;
1842     if (!pnarbin && !pnam)
1843         return ERROR_INT("no output requested", procName, 1);
1844     if (!na)
1845         return ERROR_INT("na not defined", procName, 1);
1846     if (numaGetCount(na) == 0)
1847         return ERROR_INT("na is empty", procName, 1);
1848     if (nbins < 2)
1849         return ERROR_INT("nbins must be > 1", procName, 1);
1850 
1851         /* Get normalized histogram  */
1852     numaGetMax(na, &maxval, NULL);
1853     maxbins = L_MIN(100002, (l_int32)maxval + 2);
1854     nah = numaMakeHistogram(na, maxbins, &discardval, NULL);
1855     nan = numaNormalizeHistogram(nah, 1.0);
1856 
1857         /* Warn if there is a scale change.  This shouldn't happen
1858          * unless the max value is above 100000.  */
1859     numaGetParameters(nan, NULL, &delx);
1860     if (delx > 1.0)
1861         L_WARNING("scale change: delx = %6.2f\n", procName, delx);
1862 
1863         /* Rank bin the results */
1864     numaDiscretizeRankAndIntensity(nan, nbins, pnarbin, pnam, NULL, NULL);
1865     numaDestroy(&nah);
1866     numaDestroy(&nan);
1867     return 0;
1868 }
1869 
1870 
1871 /*----------------------------------------------------------------------*
1872  *                      Splitting a distribution                        *
1873  *----------------------------------------------------------------------*/
1874 /*!
1875  * \brief   numaSplitDistribution()
1876  *
1877  * \param[in]    na histogram
1878  * \param[in]    scorefract fraction of the max score, used to determine
1879  *                          the range over which the histogram min is searched
1880  * \param[out]   psplitindex [optional] index for splitting
1881  * \param[out]   pave1 [optional] average of lower distribution
1882  * \param[out]   pave2 [optional] average of upper distribution
1883  * \param[out]   pnum1 [optional] population of lower distribution
1884  * \param[out]   pnum2 [optional] population of upper distribution
1885  * \param[out]   pnascore [optional] for debugging; otherwise use NULL
1886  * \return  0 if OK, 1 on error
1887  *
1888  * <pre>
1889  * Notes:
1890  *      (1) This function is intended to be used on a distribution of
1891  *          values that represent two sets, such as a histogram of
1892  *          pixel values for an image with a fg and bg, and the goal
1893  *          is to determine the averages of the two sets and the
1894  *          best splitting point.
1895  *      (2) The Otsu method finds a split point that divides the distribution
1896  *          into two parts by maximizing a score function that is the
1897  *          product of two terms:
1898  *            (a) the square of the difference of centroids, (ave1 - ave2)^2
1899  *            (b) fract1 * (1 - fract1)
1900  *          where fract1 is the fraction in the lower distribution.
1901  *      (3) This works well for images where the fg and bg are
1902  *          each relatively homogeneous and well-separated in color.
1903  *          However, if the actual fg and bg sets are very different
1904  *          in size, and the bg is highly varied, as can occur in some
1905  *          scanned document images, this will bias the split point
1906  *          into the larger "bump" (i.e., toward the point where the
1907  *          (b) term reaches its maximum of 0.25 at fract1 = 0.5.
1908  *          To avoid this, we define a range of values near the
1909  *          maximum of the score function, and choose the value within
1910  *          this range such that the histogram itself has a minimum value.
1911  *          The range is determined by scorefract: we include all abscissa
1912  *          values to the left and right of the value that maximizes the
1913  *          score, such that the score stays above (1 - scorefract) * maxscore.
1914  *          The intuition behind this modification is to try to find
1915  *          a split point that both has a high variance score and is
1916  *          at or near a minimum in the histogram, so that the histogram
1917  *          slope is small at the split point.
1918  *      (4) We normalize the score so that if the two distributions
1919  *          were of equal size and at opposite ends of the numa, the
1920  *          score would be 1.0.
1921  * </pre>
1922  */
1923 l_int32
numaSplitDistribution(NUMA * na,l_float32 scorefract,l_int32 * psplitindex,l_float32 * pave1,l_float32 * pave2,l_float32 * pnum1,l_float32 * pnum2,NUMA ** pnascore)1924 numaSplitDistribution(NUMA       *na,
1925                       l_float32   scorefract,
1926                       l_int32    *psplitindex,
1927                       l_float32  *pave1,
1928                       l_float32  *pave2,
1929                       l_float32  *pnum1,
1930                       l_float32  *pnum2,
1931                       NUMA      **pnascore)
1932 {
1933 l_int32    i, n, bestsplit, minrange, maxrange, maxindex;
1934 l_float32  ave1, ave2, ave1prev, ave2prev;
1935 l_float32  num1, num2, num1prev, num2prev;
1936 l_float32  val, minval, sum, fract1;
1937 l_float32  norm, score, minscore, maxscore;
1938 NUMA      *nascore, *naave1, *naave2, *nanum1, *nanum2;
1939 
1940     PROCNAME("numaSplitDistribution");
1941 
1942     if (psplitindex) *psplitindex = 0;
1943     if (pave1) *pave1 = 0.0;
1944     if (pave2) *pave2 = 0.0;
1945     if (pnum1) *pnum1 = 0.0;
1946     if (pnum2) *pnum2 = 0.0;
1947     if (pnascore) *pnascore = NULL;
1948     if (!na)
1949         return ERROR_INT("na not defined", procName, 1);
1950 
1951     n = numaGetCount(na);
1952     if (n <= 1)
1953         return ERROR_INT("n = 1 in histogram", procName, 1);
1954     numaGetSum(na, &sum);
1955     if (sum <= 0.0)
1956         return ERROR_INT("sum <= 0.0", procName, 1);
1957     norm = 4.0 / ((l_float32)(n - 1) * (n - 1));
1958     ave1prev = 0.0;
1959     numaGetHistogramStats(na, 0.0, 1.0, &ave2prev, NULL, NULL, NULL);
1960     num1prev = 0.0;
1961     num2prev = sum;
1962     maxindex = n / 2;  /* initialize with something */
1963 
1964         /* Split the histogram with [0 ... i] in the lower part
1965          * and [i+1 ... n-1] in upper part.  First, compute an otsu
1966          * score for each possible splitting.  */
1967     if ((nascore = numaCreate(n)) == NULL)
1968         return ERROR_INT("nascore not made", procName, 1);
1969     naave1 = (pave1) ? numaCreate(n) : NULL;
1970     naave2 = (pave2) ? numaCreate(n) : NULL;
1971     nanum1 = (pnum1) ? numaCreate(n) : NULL;
1972     nanum2 = (pnum2) ? numaCreate(n) : NULL;
1973     maxscore = 0.0;
1974     for (i = 0; i < n; i++) {
1975         numaGetFValue(na, i, &val);
1976         num1 = num1prev + val;
1977         if (num1 == 0)
1978             ave1 = ave1prev;
1979         else
1980             ave1 = (num1prev * ave1prev + i * val) / num1;
1981         num2 = num2prev - val;
1982         if (num2 == 0)
1983             ave2 = ave2prev;
1984         else
1985             ave2 = (num2prev * ave2prev - i * val) / num2;
1986         fract1 = num1 / sum;
1987         score = norm * (fract1 * (1 - fract1)) * (ave2 - ave1) * (ave2 - ave1);
1988         numaAddNumber(nascore, score);
1989         if (pave1) numaAddNumber(naave1, ave1);
1990         if (pave2) numaAddNumber(naave2, ave2);
1991         if (pnum1) numaAddNumber(nanum1, num1);
1992         if (pnum2) numaAddNumber(nanum2, num2);
1993         if (score > maxscore) {
1994             maxscore = score;
1995             maxindex = i;
1996         }
1997         num1prev = num1;
1998         num2prev = num2;
1999         ave1prev = ave1;
2000         ave2prev = ave2;
2001     }
2002 
2003         /* Next, for all contiguous scores within a specified fraction
2004          * of the max, choose the split point as the value with the
2005          * minimum in the histogram. */
2006     minscore = (1. - scorefract) * maxscore;
2007     for (i = maxindex - 1; i >= 0; i--) {
2008         numaGetFValue(nascore, i, &val);
2009         if (val < minscore)
2010             break;
2011     }
2012     minrange = i + 1;
2013     for (i = maxindex + 1; i < n; i++) {
2014         numaGetFValue(nascore, i, &val);
2015         if (val < minscore)
2016             break;
2017     }
2018     maxrange = i - 1;
2019     numaGetFValue(na, minrange, &minval);
2020     bestsplit = minrange;
2021     for (i = minrange + 1; i <= maxrange; i++) {
2022         numaGetFValue(na, i, &val);
2023         if (val < minval) {
2024             minval = val;
2025             bestsplit = i;
2026         }
2027     }
2028 
2029         /* Add one to the bestsplit value to get the threshold value,
2030          * because when we take a threshold, as in pixThresholdToBinary(),
2031          * we always choose the set with values below the threshold. */
2032     bestsplit = L_MIN(255, bestsplit + 1);
2033 
2034     if (psplitindex) *psplitindex = bestsplit;
2035     if (pave1) numaGetFValue(naave1, bestsplit, pave1);
2036     if (pave2) numaGetFValue(naave2, bestsplit, pave2);
2037     if (pnum1) numaGetFValue(nanum1, bestsplit, pnum1);
2038     if (pnum2) numaGetFValue(nanum2, bestsplit, pnum2);
2039 
2040     if (pnascore) {  /* debug mode */
2041         fprintf(stderr, "minrange = %d, maxrange = %d\n", minrange, maxrange);
2042         fprintf(stderr, "minval = %10.0f\n", minval);
2043         gplotSimple1(nascore, GPLOT_PNG, "/tmp/lept/nascore",
2044                      "Score for split distribution");
2045         *pnascore = nascore;
2046     } else {
2047         numaDestroy(&nascore);
2048     }
2049 
2050     if (pave1) numaDestroy(&naave1);
2051     if (pave2) numaDestroy(&naave2);
2052     if (pnum1) numaDestroy(&nanum1);
2053     if (pnum2) numaDestroy(&nanum2);
2054     return 0;
2055 }
2056 
2057 
2058 /*----------------------------------------------------------------------*
2059  *                         Comparing histograms                         *
2060  *----------------------------------------------------------------------*/
2061 /*!
2062  * \brief   grayHistogramsToEMD()
2063  *
2064  * \param[in]    naa1, naa2 two numaa, each with one or more 256-element
2065  *                          histograms
2066  * \param[out]   pnad nad of EM distances for each histogram
2067  * \return  0 if OK, 1 on error
2068  *
2069  * <pre>
2070  * Notes:
2071  *     (1) The two numaas must be the same size and have corresponding
2072  *         256-element histograms.  Pairs do not need to be normalized
2073  *         to the same sum.
2074  *     (2) This is typically used on two sets of histograms from
2075  *         corresponding tiles of two images.  The similarity of two
2076  *         images can be found with the scoring function used in
2077  *         pixCompareGrayByHisto():
2078  *             score S = 1.0 - k * D, where
2079  *                 k is a constant, say in the range 5-10
2080  *                 D = EMD
2081  *             for each tile; for multiple tiles, take the Min(S) over
2082  *             the set of tiles to be the final score.
2083  * </pre>
2084  */
2085 l_int32
grayHistogramsToEMD(NUMAA * naa1,NUMAA * naa2,NUMA ** pnad)2086 grayHistogramsToEMD(NUMAA  *naa1,
2087                     NUMAA  *naa2,
2088                     NUMA  **pnad)
2089 {
2090 l_int32     i, n, nt;
2091 l_float32   dist;
2092 NUMA       *na1, *na2, *nad;
2093 
2094     PROCNAME("grayHistogramsToEMD");
2095 
2096     if (!pnad)
2097         return ERROR_INT("&nad not defined", procName, 1);
2098     *pnad = NULL;
2099     if (!naa1 || !naa2)
2100         return ERROR_INT("na1 and na2 not both defined", procName, 1);
2101     n = numaaGetCount(naa1);
2102     if (n != numaaGetCount(naa2))
2103         return ERROR_INT("naa1 and naa2 numa counts differ", procName, 1);
2104     nt = numaaGetNumberCount(naa1);
2105     if (nt != numaaGetNumberCount(naa2))
2106         return ERROR_INT("naa1 and naa2 number counts differ", procName, 1);
2107     if (256 * n != nt)  /* good enough check */
2108         return ERROR_INT("na sizes must be 256", procName, 1);
2109 
2110     nad = numaCreate(n);
2111     *pnad = nad;
2112     for (i = 0; i < n; i++) {
2113         na1 = numaaGetNuma(naa1, i, L_CLONE);
2114         na2 = numaaGetNuma(naa2, i, L_CLONE);
2115         numaEarthMoverDistance(na1, na2, &dist);
2116         numaAddNumber(nad, dist / 255.);  /* normalize to [0.0 - 1.0] */
2117         numaDestroy(&na1);
2118         numaDestroy(&na2);
2119     }
2120     return 0;
2121 }
2122 
2123 
2124 /*!
2125  * \brief   numaEarthMoverDistance()
2126  *
2127  * \param[in]    na1, na2 two numas of the same size, typically histograms
2128  * \param[out]   pdist EM distance
2129  * \return  0 if OK, 1 on error
2130  *
2131  * <pre>
2132  * Notes:
2133  *     (1) The two numas must have the same size.  They do not need to be
2134  *         normalized to the same sum before applying the function.
2135  *     (2) For a 1D discrete function, the implementation of the EMD
2136  *         is trivial.  Just keep filling or emptying buckets in one numa
2137  *         to match the amount in the other, moving sequentially along
2138  *         both arrays.
2139  *     (3) We divide the sum of the absolute value of everything moved
2140  *         (by 1 unit at a time) by the sum of the numa (amount of "earth")
2141  *         to get the average distance that the "earth" was moved.
2142  *         This is the value returned here.
2143  *     (4) The caller can do a further normalization, by the number of
2144  *         buckets (minus 1), to get the EM distance as a fraction of
2145  *         the maximum possible distance, which is n-1.  This fraction
2146  *         is 1.0 for the situation where all the 'earth' in the first
2147  *         array is at one end, and all in the second array is at the
2148  *         other end.
2149  * </pre>
2150  */
2151 l_int32
numaEarthMoverDistance(NUMA * na1,NUMA * na2,l_float32 * pdist)2152 numaEarthMoverDistance(NUMA       *na1,
2153                        NUMA       *na2,
2154                        l_float32  *pdist)
2155 {
2156 l_int32     n, norm, i;
2157 l_float32   sum1, sum2, diff, total;
2158 l_float32  *array1, *array3;
2159 NUMA       *na3;
2160 
2161     PROCNAME("numaEarthMoverDistance");
2162 
2163     if (!pdist)
2164         return ERROR_INT("&dist not defined", procName, 1);
2165     *pdist = 0.0;
2166     if (!na1 || !na2)
2167         return ERROR_INT("na1 and na2 not both defined", procName, 1);
2168     n = numaGetCount(na1);
2169     if (n != numaGetCount(na2))
2170         return ERROR_INT("na1 and na2 have different size", procName, 1);
2171 
2172         /* Generate na3; normalize to na1 if necessary */
2173     numaGetSum(na1, &sum1);
2174     numaGetSum(na2, &sum2);
2175     norm = (L_ABS(sum1 - sum2) < 0.00001 * L_ABS(sum1)) ? 1 : 0;
2176     if (!norm)
2177         na3 = numaTransform(na2, 0, sum1 / sum2);
2178     else
2179         na3 = numaCopy(na2);
2180     array1 = numaGetFArray(na1, L_NOCOPY);
2181     array3 = numaGetFArray(na3, L_NOCOPY);
2182 
2183         /* Move earth in n3 from array elements, to match n1 */
2184     total = 0;
2185     for (i = 1; i < n; i++) {
2186         diff = array1[i - 1] - array3[i - 1];
2187         array3[i] -= diff;
2188         total += L_ABS(diff);
2189     }
2190     *pdist = total / sum1;
2191 
2192     numaDestroy(&na3);
2193     return 0;
2194 }
2195 
2196 
2197 /*!
2198  * \brief   grayInterHistogramStats()
2199  *
2200  * \param[in]    naa numaa with two or more 256-element histograms
2201  * \param[in]    wc half-width of the smoothing window
2202  * \param[out]   pnam [optional] mean values
2203  * \param[out]   pnams [optional] mean square values
2204  * \param[out]   pnav [optional] variances
2205  * \param[out]   pnarv [optional] rms deviations from the mean
2206  * \return  0 if OK, 1 on error
2207  *
2208  * <pre>
2209  * Notes:
2210  *     (1) The %naa has two or more 256-element numa histograms, which
2211  *         are to be compared value-wise at each of the 256 gray levels.
2212  *         The result are stats (mean, mean square, variance, root variance)
2213  *         aggregated across the set of histograms, and each is output
2214  *         as a 256 entry numa.  Think of these histograms as a matrix,
2215  *         where each histogram is one row of the array.  The stats are
2216  *         then aggregated column-wise, between the histograms.
2217  *     (2) These stats are:
2218  *            ~ average value: <v>  (nam)
2219  *            ~ average squared value: <v*v> (nams)
2220  *            ~ variance: <(v - <v>)*(v - <v>)> = <v*v> - <v>*<v>  (nav)
2221  *            ~ square-root of variance: (narv)
2222  *         where the brackets < .. > indicate that the average value is
2223  *         to be taken over each column of the array.
2224  *     (3) The input histograms are optionally smoothed before these
2225  *         statistical operations.
2226  *     (4) The input histograms are normalized to a sum of 10000.  By
2227  *         doing this, the resulting numbers are independent of the
2228  *         number of samples used in building the individual histograms.
2229  *     (5) A typical application is on a set of histograms from tiles
2230  *         of an image, to distinguish between text/tables and photo
2231  *         regions.  If the tiles are much larger than the text line
2232  *         spacing, text/table regions typically have smaller variance
2233  *         across tiles than photo regions.  For this application, it
2234  *         may be useful to ignore values near white, which are large for
2235  *         text and would magnify the variance due to variations in
2236  *         illumination.  However, because the variance of a drawing or
2237  *         a light photo can be similar to that of grayscale text, this
2238  *         function is only a discriminator between darker photos/drawings
2239  *         and light photos/text/line-graphics.
2240  * </pre>
2241  */
2242 l_int32
grayInterHistogramStats(NUMAA * naa,l_int32 wc,NUMA ** pnam,NUMA ** pnams,NUMA ** pnav,NUMA ** pnarv)2243 grayInterHistogramStats(NUMAA   *naa,
2244                         l_int32  wc,
2245                         NUMA   **pnam,
2246                         NUMA   **pnams,
2247                         NUMA   **pnav,
2248                         NUMA   **pnarv)
2249 {
2250 l_int32      i, j, n, nn;
2251 l_float32  **arrays;
2252 l_float32    mean, var, rvar;
2253 NUMA        *na1, *na2, *na3, *na4;
2254 
2255     PROCNAME("grayInterHistogramStats");
2256 
2257     if (pnam) *pnam = NULL;
2258     if (pnams) *pnams = NULL;
2259     if (pnav) *pnav = NULL;
2260     if (pnarv) *pnarv = NULL;
2261     if (!pnam && !pnams && !pnav && !pnarv)
2262         return ERROR_INT("nothing requested", procName, 1);
2263     if (!naa)
2264         return ERROR_INT("naa not defined", procName, 1);
2265     n = numaaGetCount(naa);
2266     for (i = 0; i < n; i++) {
2267         nn = numaaGetNumaCount(naa, i);
2268         if (nn != 256) {
2269             L_ERROR("%d numbers in numa[%d]\n", procName, nn, i);
2270             return 1;
2271         }
2272     }
2273 
2274     if (pnam) *pnam = numaCreate(256);
2275     if (pnams) *pnams = numaCreate(256);
2276     if (pnav) *pnav = numaCreate(256);
2277     if (pnarv) *pnarv = numaCreate(256);
2278 
2279         /* First, use mean smoothing, normalize each histogram,
2280          * and save all results in a 2D matrix. */
2281     arrays = (l_float32 **)LEPT_CALLOC(n, sizeof(l_float32 *));
2282     for (i = 0; i < n; i++) {
2283         na1 = numaaGetNuma(naa, i, L_CLONE);
2284         na2 = numaWindowedMean(na1, wc);
2285         na3 = numaNormalizeHistogram(na2, 10000.);
2286         arrays[i] = numaGetFArray(na3, L_COPY);
2287         numaDestroy(&na1);
2288         numaDestroy(&na2);
2289         numaDestroy(&na3);
2290     }
2291 
2292         /* Get stats between histograms */
2293     for (j = 0; j < 256; j++) {
2294         na4 = numaCreate(n);
2295         for (i = 0; i < n; i++) {
2296             numaAddNumber(na4, arrays[i][j]);
2297         }
2298         numaSimpleStats(na4, 0, 0, &mean, &var, &rvar);
2299         if (pnam) numaAddNumber(*pnam, mean);
2300         if (pnams) numaAddNumber(*pnams, mean * mean);
2301         if (pnav) numaAddNumber(*pnav, var);
2302         if (pnarv) numaAddNumber(*pnarv, rvar);
2303         numaDestroy(&na4);
2304     }
2305 
2306     for (i = 0; i < n; i++)
2307         LEPT_FREE(arrays[i]);
2308     LEPT_FREE(arrays);
2309     return 0;
2310 }
2311 
2312 
2313 /*----------------------------------------------------------------------*
2314  *                             Extrema finding                          *
2315  *----------------------------------------------------------------------*/
2316 /*!
2317  * \brief   numaFindPeaks()
2318  *
2319  * \param[in]    nas     source numa
2320  * \param[in]    nmax    max number of peaks to be found
2321  * \param[in]    fract1  min fraction of peak value
2322  * \param[in]    fract2  min slope
2323  * \return  peak na, or NULL on error.
2324  *
2325  * <pre>
2326  * Notes:
2327  *     (1) The returned na consists of sets of four numbers representing
2328  *         the peak, in the following order:
2329  *            left edge; peak center; right edge; normalized peak area
2330  * </pre>
2331  */
2332 NUMA *
numaFindPeaks(NUMA * nas,l_int32 nmax,l_float32 fract1,l_float32 fract2)2333 numaFindPeaks(NUMA      *nas,
2334               l_int32    nmax,
2335               l_float32  fract1,
2336               l_float32  fract2)
2337 {
2338 l_int32    i, k, n, maxloc, lloc, rloc;
2339 l_float32  fmaxval, sum, total, newtotal, val, lastval;
2340 l_float32  peakfract;
2341 NUMA      *na, *napeak;
2342 
2343     PROCNAME("numaFindPeaks");
2344 
2345     if (!nas)
2346         return (NUMA *)ERROR_PTR("nas not defined", procName, NULL);
2347     n = numaGetCount(nas);
2348     numaGetSum(nas, &total);
2349 
2350         /* We munge this copy */
2351     if ((na = numaCopy(nas)) == NULL)
2352         return (NUMA *)ERROR_PTR("na not made", procName, NULL);
2353     if ((napeak = numaCreate(4 * nmax)) == NULL) {
2354         numaDestroy(&na);
2355         return (NUMA *)ERROR_PTR("napeak not made", procName, NULL);
2356     }
2357 
2358     for (k = 0; k < nmax; k++) {
2359         numaGetSum(na, &newtotal);
2360         if (newtotal == 0.0)   /* sanity check */
2361             break;
2362         numaGetMax(na, &fmaxval, &maxloc);
2363         sum = fmaxval;
2364         lastval = fmaxval;
2365         lloc = 0;
2366         for (i = maxloc - 1; i >= 0; --i) {
2367             numaGetFValue(na, i, &val);
2368             if (val == 0.0) {
2369                 lloc = i + 1;
2370                 break;
2371             }
2372             if (val > fract1 * fmaxval) {
2373                 sum += val;
2374                 lastval = val;
2375                 continue;
2376             }
2377             if (lastval - val > fract2 * lastval) {
2378                 sum += val;
2379                 lastval = val;
2380                 continue;
2381             }
2382             lloc = i;
2383             break;
2384         }
2385         lastval = fmaxval;
2386         rloc = n - 1;
2387         for (i = maxloc + 1; i < n; ++i) {
2388             numaGetFValue(na, i, &val);
2389             if (val == 0.0) {
2390                 rloc = i - 1;
2391                 break;
2392             }
2393             if (val > fract1 * fmaxval) {
2394                 sum += val;
2395                 lastval = val;
2396                 continue;
2397             }
2398             if (lastval - val > fract2 * lastval) {
2399                 sum += val;
2400                 lastval = val;
2401                 continue;
2402             }
2403             rloc = i;
2404             break;
2405         }
2406         peakfract = sum / total;
2407         numaAddNumber(napeak, lloc);
2408         numaAddNumber(napeak, maxloc);
2409         numaAddNumber(napeak, rloc);
2410         numaAddNumber(napeak, peakfract);
2411 
2412         for (i = lloc; i <= rloc; i++)
2413             numaSetValue(na, i, 0.0);
2414     }
2415 
2416     numaDestroy(&na);
2417     return napeak;
2418 }
2419 
2420 
2421 /*!
2422  * \brief   numaFindExtrema()
2423  *
2424  * \param[in]    nas input values
2425  * \param[in]    delta relative amount to resolve peaks and valleys
2426  * \param[out]   pnav [optional] values of extrema
2427  * \return  nad (locations of extrema, or NULL on error
2428  *
2429  * <pre>
2430  * Notes:
2431  *      (1) This returns a sequence of extrema (peaks and valleys).
2432  *      (2) The algorithm is analogous to that for determining
2433  *          mountain peaks.  Suppose we have a local peak, with
2434  *          bumps on the side.  Under what conditions can we consider
2435  *          those 'bumps' to be actual peaks?  The answer: if the
2436  *          bump is separated from the peak by a saddle that is at
2437  *          least 500 feet below the bump.
2438  *      (3) Operationally, suppose we are looking for a peak.
2439  *          We are keeping the largest value we've seen since the
2440  *          last valley, and are looking for a value that is delta
2441  *          BELOW our current peak.  When we find such a value,
2442  *          we label the peak, use the current value to label the
2443  *          valley, and then do the same operation in reverse (looking
2444  *          for a valley).
2445  * </pre>
2446  */
2447 NUMA *
numaFindExtrema(NUMA * nas,l_float32 delta,NUMA ** pnav)2448 numaFindExtrema(NUMA      *nas,
2449                 l_float32  delta,
2450                 NUMA     **pnav)
2451 {
2452 l_int32    i, n, found, loc, direction;
2453 l_float32  startval, val, maxval, minval;
2454 NUMA      *nav, *nad;
2455 
2456     PROCNAME("numaFindExtrema");
2457 
2458     if (pnav) *pnav = NULL;
2459     if (!nas)
2460         return (NUMA *)ERROR_PTR("nas not defined", procName, NULL);
2461 
2462     n = numaGetCount(nas);
2463     nad = numaCreate(0);
2464     nav =  NULL;
2465     if (pnav) {
2466         nav = numaCreate(0);
2467         *pnav = nav;
2468     }
2469 
2470         /* We don't know if we'll find a peak or valley first,
2471          * but use the first element of nas as the reference point.
2472          * Break when we deviate by 'delta' from the first point. */
2473     numaGetFValue(nas, 0, &startval);
2474     found = FALSE;
2475     for (i = 1; i < n; i++) {
2476         numaGetFValue(nas, i, &val);
2477         if (L_ABS(val - startval) >= delta) {
2478             found = TRUE;
2479             break;
2480         }
2481     }
2482 
2483     if (!found)
2484         return nad;  /* it's empty */
2485 
2486         /* Are we looking for a peak or a valley? */
2487     if (val > startval) {  /* peak */
2488         direction = 1;
2489         maxval = val;
2490     } else {
2491         direction = -1;
2492         minval = val;
2493     }
2494     loc = i;
2495 
2496         /* Sweep through the rest of the array, recording alternating
2497          * peak/valley extrema. */
2498     for (i = i + 1; i < n; i++) {
2499         numaGetFValue(nas, i, &val);
2500         if (direction == 1 && val > maxval ) {  /* new local max */
2501             maxval = val;
2502             loc = i;
2503         } else if (direction == -1 && val < minval ) {  /* new local min */
2504             minval = val;
2505             loc = i;
2506         } else if (direction == 1 && (maxval - val >= delta)) {
2507             numaAddNumber(nad, loc);  /* save the current max location */
2508             if (nav) numaAddNumber(nav, maxval);
2509             direction = -1;  /* reverse: start looking for a min */
2510             minval = val;
2511             loc = i;  /* current min location */
2512         } else if (direction == -1 && (val - minval >= delta)) {
2513             numaAddNumber(nad, loc);  /* save the current min location */
2514             if (nav) numaAddNumber(nav, minval);
2515             direction = 1;  /* reverse: start looking for a max */
2516             maxval = val;
2517             loc = i;  /* current max location */
2518         }
2519     }
2520 
2521         /* Save the final extremum */
2522 /*    numaAddNumber(nad, loc); */
2523     return nad;
2524 }
2525 
2526 
2527 /*!
2528  * \brief   numaCountReversals()
2529  *
2530  * \param[in]    nas input values
2531  * \param[in]    minreversal relative amount to resolve peaks and valleys
2532  * \param[out]   pnr [optional] number of reversals
2533  *           [out]   pnrpl ([optional] reversal density: reversals/length
2534  * \return  0 if OK, 1 on error
2535  *
2536  * <pre>
2537  * Notes:
2538  *      (1) The input numa is can be generated from pixExtractAlongLine().
2539  *          If so, the x parameters can be used to find the reversal
2540  *          frequency along a line.
2541  * </pre>
2542  */
2543 l_int32
numaCountReversals(NUMA * nas,l_float32 minreversal,l_int32 * pnr,l_float32 * pnrpl)2544 numaCountReversals(NUMA       *nas,
2545                    l_float32   minreversal,
2546                    l_int32    *pnr,
2547                    l_float32  *pnrpl)
2548 {
2549 l_int32    n, nr;
2550 l_float32  delx, len;
2551 NUMA      *nat;
2552 
2553     PROCNAME("numaCountReversals");
2554 
2555     if (pnr) *pnr = 0;
2556     if (pnrpl) *pnrpl = 0.0;
2557     if (!pnr && !pnrpl)
2558         return ERROR_INT("neither &nr nor &nrpl are defined", procName, 1);
2559     if (!nas)
2560         return ERROR_INT("nas not defined", procName, 1);
2561 
2562     n = numaGetCount(nas);
2563     nat = numaFindExtrema(nas, minreversal, NULL);
2564     nr = numaGetCount(nat);
2565     if (pnr) *pnr = nr;
2566     if (pnrpl) {
2567         numaGetParameters(nas, NULL, &delx);
2568         len = delx * n;
2569         *pnrpl = (l_float32)nr / len;
2570     }
2571 
2572     numaDestroy(&nat);
2573     return 0;
2574 }
2575 
2576 
2577 /*----------------------------------------------------------------------*
2578  *                Threshold crossings and frequency analysis            *
2579  *----------------------------------------------------------------------*/
2580 /*!
2581  * \brief   numaSelectCrossingThreshold()
2582  *
2583  * \param[in]    nax [optional] numa of abscissa values; can be NULL
2584  * \param[in]    nay signal
2585  * \param[in]    estthresh estimated pixel threshold for crossing: e.g., for
2586  *                         images, white <--> black; typ. ~120
2587  * \param[out]   pbestthresh robust estimate of threshold to use
2588  * \return  0 if OK, 1 on error
2589  *
2590  * <pre>
2591  * Notes:
2592  *     (1) When a valid threshold is used, the number of crossings is
2593  *         a maximum, because none are missed.  If no threshold intersects
2594  *         all the crossings, the crossings must be determined with
2595  *         numaCrossingsByPeaks().
2596  *     (2) %estthresh is an input estimate of the threshold that should
2597  *         be used.  We compute the crossings with 41 thresholds
2598  *         (20 below and 20 above).  There is a range in which the
2599  *         number of crossings is a maximum.  Return a threshold
2600  *         in the center of this stable plateau of crossings.
2601  *         This can then be used with numaCrossingsByThreshold()
2602  *         to get a good estimate of crossing locations.
2603  * </pre>
2604  */
2605 l_int32
numaSelectCrossingThreshold(NUMA * nax,NUMA * nay,l_float32 estthresh,l_float32 * pbestthresh)2606 numaSelectCrossingThreshold(NUMA       *nax,
2607                             NUMA       *nay,
2608                             l_float32   estthresh,
2609                             l_float32  *pbestthresh)
2610 {
2611 l_int32    i, inrun, istart, iend, maxstart, maxend, runlen, maxrunlen;
2612 l_int32    val, maxval, nmax, count;
2613 l_float32  thresh, fmaxval, fmodeval;
2614 NUMA      *nat, *nac;
2615 
2616     PROCNAME("numaSelectCrossingThreshold");
2617 
2618     if (!pbestthresh)
2619         return ERROR_INT("&bestthresh not defined", procName, 1);
2620     *pbestthresh = 0.0;
2621     if (!nay)
2622         return ERROR_INT("nay not defined", procName, 1);
2623 
2624         /* Compute the number of crossings for different thresholds */
2625     nat = numaCreate(41);
2626     for (i = 0; i < 41; i++) {
2627         thresh = estthresh - 80.0 + 4.0 * i;
2628         nac = numaCrossingsByThreshold(nax, nay, thresh);
2629         numaAddNumber(nat, numaGetCount(nac));
2630         numaDestroy(&nac);
2631     }
2632 
2633         /* Find the center of the plateau of max crossings, which
2634          * extends from thresh[istart] to thresh[iend]. */
2635     numaGetMax(nat, &fmaxval, NULL);
2636     maxval = (l_int32)fmaxval;
2637     nmax = 0;
2638     for (i = 0; i < 41; i++) {
2639         numaGetIValue(nat, i, &val);
2640         if (val == maxval)
2641             nmax++;
2642     }
2643     if (nmax < 3) {  /* likely accidental max; try the mode */
2644         numaGetMode(nat, &fmodeval, &count);
2645         if (count > nmax && fmodeval > 0.5 * fmaxval)
2646             maxval = (l_int32)fmodeval;  /* use the mode */
2647     }
2648 
2649     inrun = FALSE;
2650     iend = 40;
2651     maxrunlen = 0, maxstart = 0, maxend = 0;
2652     for (i = 0; i < 41; i++) {
2653         numaGetIValue(nat, i, &val);
2654         if (val == maxval) {
2655             if (!inrun) {
2656                 istart = i;
2657                 inrun = TRUE;
2658             }
2659             continue;
2660         }
2661         if (inrun && (val != maxval)) {
2662             iend = i - 1;
2663             runlen = iend - istart + 1;
2664             inrun = FALSE;
2665             if (runlen > maxrunlen) {
2666                 maxstart = istart;
2667                 maxend = iend;
2668                 maxrunlen = runlen;
2669             }
2670         }
2671     }
2672     if (inrun) {
2673         runlen = i - istart;
2674         if (runlen > maxrunlen) {
2675             maxstart = istart;
2676             maxend = i - 1;
2677             maxrunlen = runlen;
2678         }
2679     }
2680 
2681     *pbestthresh = estthresh - 80.0 + 2.0 * (l_float32)(maxstart + maxend);
2682 
2683 #if  DEBUG_CROSSINGS
2684     fprintf(stderr, "\nCrossings attain a maximum at %d thresholds, between:\n"
2685                     "  thresh[%d] = %5.1f and thresh[%d] = %5.1f\n",
2686                     nmax, maxstart, estthresh - 80.0 + 4.0 * maxstart,
2687                     maxend, estthresh - 80.0 + 4.0 * maxend);
2688     fprintf(stderr, "The best choice: %5.1f\n", *pbestthresh);
2689     fprintf(stderr, "Number of crossings at the 41 thresholds:");
2690     numaWriteStream(stderr, nat);
2691 #endif  /* DEBUG_CROSSINGS */
2692 
2693     numaDestroy(&nat);
2694     return 0;
2695 }
2696 
2697 
2698 /*!
2699  * \brief   numaCrossingsByThreshold()
2700  *
2701  * \param[in]    nax [optional] numa of abscissa values; can be NULL
2702  * \param[in]    nay numa of ordinate values, corresponding to nax
2703  * \param[in]    thresh threshold value for nay
2704  * \return  nad abscissa pts at threshold, or NULL on error
2705  *
2706  * <pre>
2707  * Notes:
2708  *      (1) If nax == NULL, we use startx and delx from nay to compute
2709  *          the crossing values in nad.
2710  * </pre>
2711  */
2712 NUMA *
numaCrossingsByThreshold(NUMA * nax,NUMA * nay,l_float32 thresh)2713 numaCrossingsByThreshold(NUMA      *nax,
2714                          NUMA      *nay,
2715                          l_float32  thresh)
2716 {
2717 l_int32    i, n;
2718 l_float32  startx, delx;
2719 l_float32  xval1, xval2, yval1, yval2, delta1, delta2, crossval, fract;
2720 NUMA      *nad;
2721 
2722     PROCNAME("numaCrossingsByThreshold");
2723 
2724     if (!nay)
2725         return (NUMA *)ERROR_PTR("nay not defined", procName, NULL);
2726     n = numaGetCount(nay);
2727 
2728     if (nax && (numaGetCount(nax) != n))
2729         return (NUMA *)ERROR_PTR("nax and nay sizes differ", procName, NULL);
2730 
2731     nad = numaCreate(0);
2732     numaGetFValue(nay, 0, &yval1);
2733     numaGetParameters(nay, &startx, &delx);
2734     if (nax)
2735         numaGetFValue(nax, 0, &xval1);
2736     else
2737         xval1 = startx;
2738     for (i = 1; i < n; i++) {
2739         numaGetFValue(nay, i, &yval2);
2740         if (nax)
2741             numaGetFValue(nax, i, &xval2);
2742         else
2743             xval2 = startx + i * delx;
2744         delta1 = yval1 - thresh;
2745         delta2 = yval2 - thresh;
2746         if (delta1 == 0.0) {
2747             numaAddNumber(nad, xval1);
2748         } else if (delta2 == 0.0) {
2749             numaAddNumber(nad, xval2);
2750         } else if (delta1 * delta2 < 0.0) {  /* crossing */
2751             fract = L_ABS(delta1) / L_ABS(yval1 - yval2);
2752             crossval = xval1 + fract * (xval2 - xval1);
2753             numaAddNumber(nad, crossval);
2754         }
2755         xval1 = xval2;
2756         yval1 = yval2;
2757     }
2758 
2759     return nad;
2760 }
2761 
2762 
2763 /*!
2764  * \brief   numaCrossingsByPeaks()
2765  *
2766  * \param[in]    nax [optional] numa of abscissa values
2767  * \param[in]    nay numa of ordinate values, corresponding to nax
2768  * \param[in]    delta parameter used to identify when a new peak can be found
2769  * \return  nad abscissa pts at threshold, or NULL on error
2770  *
2771  * <pre>
2772  * Notes:
2773  *      (1) If nax == NULL, we use startx and delx from nay to compute
2774  *          the crossing values in nad.
2775  * </pre>
2776  */
2777 NUMA *
numaCrossingsByPeaks(NUMA * nax,NUMA * nay,l_float32 delta)2778 numaCrossingsByPeaks(NUMA      *nax,
2779                      NUMA      *nay,
2780                      l_float32  delta)
2781 {
2782 l_int32    i, j, n, np, previndex, curindex;
2783 l_float32  startx, delx;
2784 l_float32  xval1, xval2, yval1, yval2, delta1, delta2;
2785 l_float32  prevval, curval, thresh, crossval, fract;
2786 NUMA      *nap, *nad;
2787 
2788     PROCNAME("numaCrossingsByPeaks");
2789 
2790     if (!nay)
2791         return (NUMA *)ERROR_PTR("nay not defined", procName, NULL);
2792 
2793     n = numaGetCount(nay);
2794     if (nax && (numaGetCount(nax) != n))
2795         return (NUMA *)ERROR_PTR("nax and nay sizes differ", procName, NULL);
2796 
2797         /* Find the extrema.  Also add last point in nay to get
2798          * the last transition (from the last peak to the end).
2799          * The number of crossings is 1 more than the number of extrema. */
2800     nap = numaFindExtrema(nay, delta, NULL);
2801     numaAddNumber(nap, n - 1);
2802     np = numaGetCount(nap);
2803     L_INFO("Number of crossings: %d\n", procName, np);
2804 
2805         /* Do all computation in index units of nax or the delx of nay */
2806     nad = numaCreate(np);  /* output crossing locations, in nax units */
2807     previndex = 0;  /* prime the search with 1st point */
2808     numaGetFValue(nay, 0, &prevval);  /* prime the search with 1st point */
2809     numaGetParameters(nay, &startx, &delx);
2810     for (i = 0; i < np; i++) {
2811         numaGetIValue(nap, i, &curindex);
2812         numaGetFValue(nay, curindex, &curval);
2813         thresh = (prevval + curval) / 2.0;
2814         if (nax)
2815             numaGetFValue(nax, previndex, &xval1);
2816         else
2817             xval1 = startx + previndex * delx;
2818         numaGetFValue(nay, previndex, &yval1);
2819         for (j = previndex + 1; j <= curindex; j++) {
2820             if (nax)
2821                 numaGetFValue(nax, j, &xval2);
2822             else
2823                 xval2 = startx + j * delx;
2824             numaGetFValue(nay, j, &yval2);
2825             delta1 = yval1 - thresh;
2826             delta2 = yval2 - thresh;
2827             if (delta1 == 0.0) {
2828                 numaAddNumber(nad, xval1);
2829                 break;
2830             } else if (delta2 == 0.0) {
2831                 numaAddNumber(nad, xval2);
2832                 break;
2833             } else if (delta1 * delta2 < 0.0) {  /* crossing */
2834                 fract = L_ABS(delta1) / L_ABS(yval1 - yval2);
2835                 crossval = xval1 + fract * (xval2 - xval1);
2836                 numaAddNumber(nad, crossval);
2837                 break;
2838             }
2839             xval1 = xval2;
2840             yval1 = yval2;
2841         }
2842         previndex = curindex;
2843         prevval = curval;
2844     }
2845 
2846     numaDestroy(&nap);
2847     return nad;
2848 }
2849 
2850 
2851 /*!
2852  * \brief   numaEvalBestHaarParameters()
2853  *
2854  * \param[in]    nas numa of non-negative signal values
2855  * \param[in]    relweight relative weight of (-1 comb) / (+1 comb)
2856  *                          contributions to the 'convolution'.  In effect,
2857  *                          the convolution kernel is a comb consisting of
2858  *                          alternating +1 and -weight.
2859  * \param[in]    nwidth number of widths to consider
2860  * \param[in]    nshift number of shifts to consider for each width
2861  * \param[in]    minwidth smallest width to consider
2862  * \param[in]    maxwidth largest width to consider
2863  * \param[out]   pbestwidth width giving largest score
2864  * \param[out]   pbestshift shift giving largest score
2865  * \param[out]   pbestscore [optional] convolution with
2866  *                          "Haar"-like comb
2867  * \return  0 if OK, 1 on error
2868  *
2869  * <pre>
2870  * Notes:
2871  *      (1) This does a linear sweep of widths, evaluating at %nshift
2872  *          shifts for each width, computing the score from a convolution
2873  *          with a long comb, and finding the (width, shift) pair that
2874  *          gives the maximum score.  The best width is the "half-wavelength"
2875  *          of the signal.
2876  *      (2) The convolving function is a comb of alternating values
2877  *          +1 and -1 * relweight, separated by the width and phased by
2878  *          the shift.  This is similar to a Haar transform, except
2879  *          there the convolution is performed with a square wave.
2880  *      (3) The function is useful for finding the line spacing
2881  *          and strength of line signal from pixel sum projections.
2882  *      (4) The score is normalized to the size of nas divided by
2883  *          the number of half-widths.  For image applications, the input is
2884  *          typically an array of pixel projections, so one should
2885  *          normalize by dividing the score by the image width in the
2886  *          pixel projection direction.
2887  * </pre>
2888  */
2889 l_int32
numaEvalBestHaarParameters(NUMA * nas,l_float32 relweight,l_int32 nwidth,l_int32 nshift,l_float32 minwidth,l_float32 maxwidth,l_float32 * pbestwidth,l_float32 * pbestshift,l_float32 * pbestscore)2890 numaEvalBestHaarParameters(NUMA       *nas,
2891                            l_float32   relweight,
2892                            l_int32     nwidth,
2893                            l_int32     nshift,
2894                            l_float32   minwidth,
2895                            l_float32   maxwidth,
2896                            l_float32  *pbestwidth,
2897                            l_float32  *pbestshift,
2898                            l_float32  *pbestscore)
2899 {
2900 l_int32    i, j;
2901 l_float32  delwidth, delshift, width, shift, score;
2902 l_float32  bestwidth, bestshift, bestscore;
2903 
2904     PROCNAME("numaEvalBestHaarParameters");
2905 
2906     if (pbestscore) *pbestscore = 0.0;
2907     if (pbestwidth) *pbestwidth = 0.0;
2908     if (pbestshift) *pbestshift = 0.0;
2909     if (!pbestwidth || !pbestshift)
2910         return ERROR_INT("&bestwidth and &bestshift not defined", procName, 1);
2911     if (!nas)
2912         return ERROR_INT("nas not defined", procName, 1);
2913 
2914     bestscore = bestwidth = bestshift = 0.0;
2915     delwidth = (maxwidth - minwidth) / (nwidth - 1.0);
2916     for (i = 0; i < nwidth; i++) {
2917         width = minwidth + delwidth * i;
2918         delshift = width / (l_float32)(nshift);
2919         for (j = 0; j < nshift; j++) {
2920             shift = j * delshift;
2921             numaEvalHaarSum(nas, width, shift, relweight, &score);
2922             if (score > bestscore) {
2923                 bestscore = score;
2924                 bestwidth = width;
2925                 bestshift = shift;
2926 #if  DEBUG_FREQUENCY
2927                 fprintf(stderr, "width = %7.3f, shift = %7.3f, score = %7.3f\n",
2928                         width, shift, score);
2929 #endif  /* DEBUG_FREQUENCY */
2930             }
2931         }
2932     }
2933 
2934     *pbestwidth = bestwidth;
2935     *pbestshift = bestshift;
2936     if (pbestscore)
2937         *pbestscore = bestscore;
2938     return 0;
2939 }
2940 
2941 
2942 /*!
2943  * \brief   numaEvalHaarSum()
2944  *
2945  * \param[in]    nas numa of non-negative signal values
2946  * \param[in]    width distance between +1 and -1 in convolution comb
2947  * \param[in]    shift phase of the comb: location of first +1
2948  * \param[in]    relweight relative weight of (-1 comb) / (+1 comb)
2949  *                          contributions to the 'convolution'.  In effect,
2950  *                          the convolution kernel is a comb consisting of
2951  *                          alternating +1 and -weight.
2952  * \param[out]   pscore convolution with "Haar"-like comb
2953  * \return  0 if OK, 1 on error
2954  *
2955  * <pre>
2956  * Notes:
2957  *      (1) This does a convolution with a comb of alternating values
2958  *          +1 and -relweight, separated by the width and phased by the shift.
2959  *          This is similar to a Haar transform, except that for Haar,
2960  *            (1) the convolution kernel is symmetric about 0, so the
2961  *                relweight is 1.0, and
2962  *            (2) the convolution is performed with a square wave.
2963  *      (2) The score is normalized to the size of nas divided by
2964  *          twice the "width".  For image applications, the input is
2965  *          typically an array of pixel projections, so one should
2966  *          normalize by dividing the score by the image width in the
2967  *          pixel projection direction.
2968  *      (3) To get a Haar-like result, use relweight = 1.0.  For detecting
2969  *          signals where you expect every other sample to be close to
2970  *          zero, as with barcodes or filtered text lines, you can
2971  *          use relweight > 1.0.
2972  * </pre>
2973  */
2974 l_int32
numaEvalHaarSum(NUMA * nas,l_float32 width,l_float32 shift,l_float32 relweight,l_float32 * pscore)2975 numaEvalHaarSum(NUMA       *nas,
2976                 l_float32   width,
2977                 l_float32   shift,
2978                 l_float32   relweight,
2979                 l_float32  *pscore)
2980 {
2981 l_int32    i, n, nsamp, index;
2982 l_float32  score, weight, val;
2983 
2984     PROCNAME("numaEvalHaarSum");
2985 
2986     if (!pscore)
2987         return ERROR_INT("&score not defined", procName, 1);
2988     *pscore = 0.0;
2989     if (!nas)
2990         return ERROR_INT("nas not defined", procName, 1);
2991     if ((n = numaGetCount(nas)) < 2 * width)
2992         return ERROR_INT("nas size too small", procName, 1);
2993 
2994     score = 0.0;
2995     nsamp = (l_int32)((n - shift) / width);
2996     for (i = 0; i < nsamp; i++) {
2997         index = (l_int32)(shift + i * width);
2998         weight = (i % 2) ? 1.0 : -1.0 * relweight;
2999         numaGetFValue(nas, index, &val);
3000         score += weight * val;
3001     }
3002 
3003     *pscore = 2.0 * width * score / (l_float32)n;
3004     return 0;
3005 }
3006 
3007 
3008 /*----------------------------------------------------------------------*
3009  *            Generating numbers in a range under constraints           *
3010  *----------------------------------------------------------------------*/
3011 /*!
3012  * \brief   genConstrainedNumaInRange()
3013  *
3014  * \param[in]    first first number to choose; >= 0
3015  * \param[in]    last biggest possible number to reach; >= first
3016  * \param[in]    nmax maximum number of numbers to select; > 0
3017  * \param[in]    use_pairs 1 = select pairs of adjacent numbers;
3018  *                         0 = select individual numbers
3019  * \return  0 if OK, 1 on error
3020  *
3021  * <pre>
3022  * Notes:
3023  *     (1) Selection is made uniformly in the range.  This can be used
3024  *         to select pages distributed as uniformly as possible
3025  *         through a book, where you are constrained to:
3026  *          ~ choose between [first, ... biggest],
3027  *          ~ choose no more than nmax numbers, and
3028  *         and you have the option of requiring pairs of adjacent numbers.
3029  * </pre>
3030  */
3031 NUMA *
genConstrainedNumaInRange(l_int32 first,l_int32 last,l_int32 nmax,l_int32 use_pairs)3032 genConstrainedNumaInRange(l_int32  first,
3033                           l_int32  last,
3034                           l_int32  nmax,
3035                           l_int32  use_pairs)
3036 {
3037 l_int32    i, nsets, val;
3038 l_float32  delta;
3039 NUMA      *na;
3040 
3041     PROCNAME("genConstrainedNumaInRange");
3042 
3043     first = L_MAX(0, first);
3044     if (last < first)
3045         return (NUMA *)ERROR_PTR("last < first!", procName, NULL);
3046     if (nmax < 1)
3047         return (NUMA *)ERROR_PTR("nmax < 1!", procName, NULL);
3048 
3049     nsets = L_MIN(nmax, last - first + 1);
3050     if (use_pairs == 1)
3051         nsets = nsets / 2;
3052     if (nsets == 0)
3053         return (NUMA *)ERROR_PTR("nsets == 0", procName, NULL);
3054 
3055         /* Select delta so that selection covers the full range if possible */
3056     if (nsets == 1) {
3057         delta = 0.0;
3058     } else {
3059         if (use_pairs == 0)
3060             delta = (l_float32)(last - first) / (nsets - 1);
3061         else
3062             delta = (l_float32)(last - first - 1) / (nsets - 1);
3063     }
3064 
3065     na = numaCreate(nsets);
3066     for (i = 0; i < nsets; i++) {
3067         val = (l_int32)(first + i * delta + 0.5);
3068         numaAddNumber(na, val);
3069         if (use_pairs == 1)
3070             numaAddNumber(na, val + 1);
3071     }
3072 
3073     return na;
3074 }
3075