1 /*====================================================================*
2  -  Copyright (C) 2001 Leptonica.  All rights reserved.
3  -
4  -  Redistribution and use in source and binary forms, with or without
5  -  modification, are permitted provided that the following conditions
6  -  are met:
7  -  1. Redistributions of source code must retain the above copyright
8  -     notice, this list of conditions and the following disclaimer.
9  -  2. Redistributions in binary form must reproduce the above
10  -     copyright notice, this list of conditions and the following
11  -     disclaimer in the documentation and/or other materials
12  -     provided with the distribution.
13  -
14  -  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
15  -  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
16  -  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
17  -  A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL ANY
18  -  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19  -  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20  -  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21  -  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22  -  OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
23  -  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24  -  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  *====================================================================*/
26 
27 /*!
28  * \file  sarray1.c
29  * <pre>
30  *
31  *      Create/Destroy/Copy
32  *          SARRAY    *sarrayCreate()
33  *          SARRAY    *sarrayCreateInitialized()
34  *          SARRAY    *sarrayCreateWordsFromString()
35  *          SARRAY    *sarrayCreateLinesFromString()
 *          void       sarrayDestroy()
37  *          SARRAY    *sarrayCopy()
38  *          SARRAY    *sarrayClone()
39  *
40  *      Add/Remove string
41  *          l_int32    sarrayAddString()
42  *          static l_int32  sarrayExtendArray()
43  *          char      *sarrayRemoveString()
44  *          l_int32    sarrayReplaceString()
45  *          l_int32    sarrayClear()
46  *
47  *      Accessors
48  *          l_int32    sarrayGetCount()
49  *          char     **sarrayGetArray()
50  *          char      *sarrayGetString()
51  *          l_int32    sarrayGetRefcount()
52  *          l_int32    sarrayChangeRefcount()
53  *
54  *      Conversion back to string
55  *          char      *sarrayToString()
56  *          char      *sarrayToStringRange()
57  *
58  *      Join 2 sarrays
59  *          l_int32    sarrayJoin()
60  *          l_int32    sarrayAppendRange()
61  *
62  *      Pad an sarray to be the same size as another sarray
63  *          l_int32    sarrayPadToSameSize()
64  *
65  *      Convert word sarray to (formatted) line sarray
66  *          SARRAY    *sarrayConvertWordsToLines()
67  *
68  *      Split string on separator list
69  *          SARRAY    *sarraySplitString()
70  *
71  *      Filter sarray
72  *          SARRAY    *sarraySelectBySubstring()
73  *          SARRAY    *sarraySelectByRange()
74  *          l_int32    sarrayParseRange()
75  *
76  *      Serialize for I/O
77  *          SARRAY    *sarrayRead()
78  *          SARRAY    *sarrayReadStream()
79  *          SARRAY    *sarrayReadMem()
80  *          l_int32    sarrayWrite()
81  *          l_int32    sarrayWriteStream()
82  *          l_int32    sarrayWriteMem()
83  *          l_int32    sarrayAppend()
84  *
85  *      Directory filenames
86  *          SARRAY    *getNumberedPathnamesInDirectory()
87  *          SARRAY    *getSortedPathnamesInDirectory()
88  *          SARRAY    *convertSortedToNumberedPathnames()
89  *          SARRAY    *getFilenamesInDirectory()
90  *
91  *      These functions are important for efficient manipulation
92  *      of string data, and they have found widespread use in
93  *      leptonica.  For example:
94  *         (1) to generate text files: e.g., PostScript and PDF
95  *             wrappers around sets of images
96  *         (2) to parse text files: e.g., extracting prototypes
97  *             from the source to generate allheaders.h
98  *         (3) to generate code for compilation: e.g., the fast
99  *             dwa code for arbitrary structuring elements.
100  *
101  *      Comments on usage:
102  *
103  *          The user is responsible for correctly disposing of strings
104  *          that have been extracted from sarrays.  In the following,
105  *          "str_not_owned" means the returned handle does not own the string,
106  *          and "str_owned" means the returned handle owns the string.
107  *            - To extract a string from an Sarray in order to inspect it
108  *              or to make a copy of it later, get a handle to it:
109  *                  copyflag = L_NOCOPY.
110  *              In this case, you must neither free the string nor put it
111  *              directly in another array:
112  *                 str-not-owned = sarrayGetString(sa, index, L_NOCOPY);
113  *            - To extract a copy of a string from an Sarray, use:
114  *                 str-owned = sarrayGetString(sa, index, L_COPY);
 *            - To insert a string that is in one array into another
 *              array (always leaving the first array intact), there are
 *              two options:
 *                 (1) use copyflag = L_COPY to make an immediate copy,
 *                     which you then add to the second array by insertion:
 *                       str-owned = sarrayGetString(sa, index, L_COPY);
 *                       sarrayAddString(sa2, str-owned, L_INSERT);
 *                 (2) use copyflag = L_NOCOPY to get another handle to
 *                     the string; you then add a copy of it to the
 *                     second string array:
 *                       str-not-owned = sarrayGetString(sa, index, L_NOCOPY);
 *                       sarrayAddString(sa2, str-not-owned, L_COPY).
 *              With L_INSERT, sarrayAddString() transfers ownership of the
 *              string to the Sarray, so never use L_INSERT if the string
 *              is owned by another array.
129  *
130  *              In all cases, when you use copyflag = L_COPY to extract
131  *              a string from an array, you must either free it
132  *              or insert it in an array that will be freed later.
133  * </pre>
134  */
135 
136 #include <string.h>
137 #ifndef _WIN32
138 #include <dirent.h>     /* unix only */
139 #include <sys/stat.h>
140 #include <limits.h>  /* needed for realpath() */
141 #include <stdlib.h>  /* needed for realpath() */
142 #endif  /* ! _WIN32 */
143 #include "allheaders.h"
144 
    /* Default number of string ptrs allocated by sarrayCreate() when
     * the caller passes n <= 0.  The particular value is arbitrary. */
static const l_int32  INITIAL_PTR_ARRAYSIZE = 50;
    /* NOTE(review): not referenced by the functions in this part of the
     * file; presumably a buffer size used further down -- confirm. */
static const l_int32  L_BUF_SIZE = 512;

    /* Static functions */
static l_int32 sarrayExtendArray(SARRAY *sa);
150 
151 
152 /*--------------------------------------------------------------------------*
153  *                   String array create/destroy/copy/extend                *
154  *--------------------------------------------------------------------------*/
155 /*!
156  * \brief   sarrayCreate()
157  *
158  * \param[in]    n size of string ptr array to be alloc'd;
159  *               use 0 for default
160  * \return  sarray, or NULL on error
161  */
162 SARRAY *
sarrayCreate(l_int32 n)163 sarrayCreate(l_int32  n)
164 {
165 SARRAY  *sa;
166 
167     PROCNAME("sarrayCreate");
168 
169     if (n <= 0)
170         n = INITIAL_PTR_ARRAYSIZE;
171 
172     sa = (SARRAY *)LEPT_CALLOC(1, sizeof(SARRAY));
173     if ((sa->array = (char **)LEPT_CALLOC(n, sizeof(char *))) == NULL) {
174         sarrayDestroy(&sa);
175         return (SARRAY *)ERROR_PTR("ptr array not made", procName, NULL);
176     }
177 
178     sa->nalloc = n;
179     sa->n = 0;
180     sa->refcount = 1;
181     return sa;
182 }
183 
184 
185 /*!
186  * \brief   sarrayCreateInitialized()
187  *
188  * \param[in]    n size of string ptr array to be alloc'd
189  * \param[in]    initstr string to be initialized on the full array
190  * \return  sarray, or NULL on error
191  */
192 SARRAY *
sarrayCreateInitialized(l_int32 n,char * initstr)193 sarrayCreateInitialized(l_int32  n,
194                         char    *initstr)
195 {
196 l_int32  i;
197 SARRAY  *sa;
198 
199     PROCNAME("sarrayCreateInitialized");
200 
201     if (n <= 0)
202         return (SARRAY *)ERROR_PTR("n must be > 0", procName, NULL);
203     if (!initstr)
204         return (SARRAY *)ERROR_PTR("initstr not defined", procName, NULL);
205 
206     sa = sarrayCreate(n);
207     for (i = 0; i < n; i++)
208         sarrayAddString(sa, initstr, L_COPY);
209     return sa;
210 }
211 
212 
213 /*!
214  * \brief   sarrayCreateWordsFromString()
215  *
216  * \param[in]    string
217  * \return  sarray, or NULL on error
218  *
219  * <pre>
220  * Notes:
221  *      (1) This finds the number of word substrings, creates an sarray
222  *          of this size, and puts copies of each substring into the sarray.
223  * </pre>
224  */
225 SARRAY *
sarrayCreateWordsFromString(const char * string)226 sarrayCreateWordsFromString(const char  *string)
227 {
228 char     separators[] = " \n\t";
229 l_int32  i, nsub, size, inword;
230 SARRAY  *sa;
231 
232     PROCNAME("sarrayCreateWordsFromString");
233 
234     if (!string)
235         return (SARRAY *)ERROR_PTR("textstr not defined", procName, NULL);
236 
237         /* Find the number of words */
238     size = strlen(string);
239     nsub = 0;
240     inword = FALSE;
241     for (i = 0; i < size; i++) {
242         if (inword == FALSE &&
243            (string[i] != ' ' && string[i] != '\t' && string[i] != '\n')) {
244            inword = TRUE;
245            nsub++;
246         } else if (inword == TRUE &&
247            (string[i] == ' ' || string[i] == '\t' || string[i] == '\n')) {
248            inword = FALSE;
249         }
250     }
251 
252     if ((sa = sarrayCreate(nsub)) == NULL)
253         return (SARRAY *)ERROR_PTR("sa not made", procName, NULL);
254     sarraySplitString(sa, string, separators);
255 
256     return sa;
257 }
258 
259 
260 /*!
261  * \brief   sarrayCreateLinesFromString()
262  *
263  * \param[in]    string
264  * \param[in]    blankflag  0 to exclude blank lines; 1 to include
265  * \return  sarray, or NULL on error
266  *
267  * <pre>
268  * Notes:
269  *      (1) This finds the number of line substrings, each of which
270  *          ends with a newline, and puts a copy of each substring
271  *          in a new sarray.
272  *      (2) The newline characters are removed from each substring.
273  * </pre>
274  */
275 SARRAY *
sarrayCreateLinesFromString(const char * string,l_int32 blankflag)276 sarrayCreateLinesFromString(const char  *string,
277                             l_int32      blankflag)
278 {
279 l_int32  i, nsub, size, startptr;
280 char    *cstring, *substring;
281 SARRAY  *sa;
282 
283     PROCNAME("sarrayCreateLinesFromString");
284 
285     if (!string)
286         return (SARRAY *)ERROR_PTR("textstr not defined", procName, NULL);
287 
288         /* Find the number of lines */
289     size = strlen(string);
290     nsub = 0;
291     for (i = 0; i < size; i++) {
292         if (string[i] == '\n')
293             nsub++;
294     }
295 
296     if ((sa = sarrayCreate(nsub)) == NULL)
297         return (SARRAY *)ERROR_PTR("sa not made", procName, NULL);
298 
299     if (blankflag) {  /* keep blank lines as null strings */
300             /* Make a copy for munging */
301         if ((cstring = stringNew(string)) == NULL) {
302             sarrayDestroy(&sa);
303             return (SARRAY *)ERROR_PTR("cstring not made", procName, NULL);
304         }
305             /* We'll insert nulls like strtok */
306         startptr = 0;
307         for (i = 0; i < size; i++) {
308             if (cstring[i] == '\n') {
309                 cstring[i] = '\0';
310                 if (i > 0 && cstring[i - 1] == '\r')
311                     cstring[i - 1] = '\0';  /* also remove Windows CR */
312                 if ((substring = stringNew(cstring + startptr)) == NULL) {
313                     sarrayDestroy(&sa);
314                     LEPT_FREE(cstring);
315                     return (SARRAY *)ERROR_PTR("substring not made",
316                                                 procName, NULL);
317                 }
318                 sarrayAddString(sa, substring, L_INSERT);
319 /*                fprintf(stderr, "substring = %s\n", substring); */
320                 startptr = i + 1;
321             }
322         }
323         if (startptr < size) {  /* no newline at end of last line */
324             if ((substring = stringNew(cstring + startptr)) == NULL) {
325                 sarrayDestroy(&sa);
326                 LEPT_FREE(cstring);
327                 return (SARRAY *)ERROR_PTR("substring not made",
328                                            procName, NULL);
329             }
330             sarrayAddString(sa, substring, L_INSERT);
331 /*            fprintf(stderr, "substring = %s\n", substring); */
332         }
333         LEPT_FREE(cstring);
334     } else {  /* remove blank lines; use strtok */
335         sarraySplitString(sa, string, "\r\n");
336     }
337 
338     return sa;
339 }
340 
341 
342 /*!
343  * \brief   sarrayDestroy()
344  *
345  * \param[in,out]   psa to be nulled
346  * \return  void
347  *
348  * <pre>
349  * Notes:
350  *      (1) Decrements the ref count and, if 0, destroys the sarray.
351  *      (2) Always nulls the input ptr.
352  * </pre>
353  */
354 void
sarrayDestroy(SARRAY ** psa)355 sarrayDestroy(SARRAY  **psa)
356 {
357 l_int32  i;
358 SARRAY  *sa;
359 
360     PROCNAME("sarrayDestroy");
361 
362     if (psa == NULL) {
363         L_WARNING("ptr address is NULL!\n", procName);
364         return;
365     }
366     if ((sa = *psa) == NULL)
367         return;
368 
369     sarrayChangeRefcount(sa, -1);
370     if (sarrayGetRefcount(sa) <= 0) {
371         if (sa->array) {
372             for (i = 0; i < sa->n; i++) {
373                 if (sa->array[i])
374                     LEPT_FREE(sa->array[i]);
375             }
376             LEPT_FREE(sa->array);
377         }
378         LEPT_FREE(sa);
379     }
380 
381     *psa = NULL;
382     return;
383 }
384 
385 
386 /*!
387  * \brief   sarrayCopy()
388  *
389  * \param[in]    sa string array
390  * \return  copy of sarray, or NULL on error
391  */
392 SARRAY *
sarrayCopy(SARRAY * sa)393 sarrayCopy(SARRAY  *sa)
394 {
395 l_int32  i;
396 SARRAY  *csa;
397 
398     PROCNAME("sarrayCopy");
399 
400     if (!sa)
401         return (SARRAY *)ERROR_PTR("sa not defined", procName, NULL);
402 
403     if ((csa = sarrayCreate(sa->nalloc)) == NULL)
404         return (SARRAY *)ERROR_PTR("csa not made", procName, NULL);
405 
406     for (i = 0; i < sa->n; i++)
407         sarrayAddString(csa, sa->array[i], L_COPY);
408 
409     return csa;
410 }
411 
412 
413 /*!
414  * \brief   sarrayClone()
415  *
416  * \param[in]    sa string array
417  * \return  ptr to same sarray, or NULL on error
418  */
419 SARRAY *
sarrayClone(SARRAY * sa)420 sarrayClone(SARRAY  *sa)
421 {
422     PROCNAME("sarrayClone");
423 
424     if (!sa)
425         return (SARRAY *)ERROR_PTR("sa not defined", procName, NULL);
426     sarrayChangeRefcount(sa, 1);
427     return sa;
428 }
429 
430 
431 /*!
432  * \brief   sarrayAddString()
433  *
434  * \param[in]    sa string array
435  * \param[in]    string  string to be added
436  * \param[in]    copyflag  L_INSERT, L_NOCOPY or L_COPY
437  * \return  0 if OK, 1 on error
438  *
439  * <pre>
440  * Notes:
441  *      (1) See usage comments at the top of this file.  L_INSERT is
442  *          equivalent to L_NOCOPY.
443  * </pre>
444  */
445 l_int32
sarrayAddString(SARRAY * sa,char * string,l_int32 copyflag)446 sarrayAddString(SARRAY  *sa,
447                 char    *string,
448                 l_int32  copyflag)
449 {
450 l_int32  n;
451 
452     PROCNAME("sarrayAddString");
453 
454     if (!sa)
455         return ERROR_INT("sa not defined", procName, 1);
456     if (!string)
457         return ERROR_INT("string not defined", procName, 1);
458     if (copyflag != L_INSERT && copyflag != L_NOCOPY && copyflag != L_COPY)
459         return ERROR_INT("invalid copyflag", procName, 1);
460 
461     n = sarrayGetCount(sa);
462     if (n >= sa->nalloc)
463         sarrayExtendArray(sa);
464 
465     if (copyflag == L_COPY)
466         sa->array[n] = stringNew(string);
467     else  /* L_INSERT or L_NOCOPY */
468         sa->array[n] = string;
469     sa->n++;
470 
471     return 0;
472 }
473 
474 
475 /*!
476  * \brief   sarrayExtendArray()
477  *
478  * \param[in]    sa string array
479  * \return  0 if OK, 1 on error
480  */
481 static l_int32
sarrayExtendArray(SARRAY * sa)482 sarrayExtendArray(SARRAY  *sa)
483 {
484     PROCNAME("sarrayExtendArray");
485 
486     if (!sa)
487         return ERROR_INT("sa not defined", procName, 1);
488 
489     if ((sa->array = (char **)reallocNew((void **)&sa->array,
490                               sizeof(char *) * sa->nalloc,
491                               2 * sizeof(char *) * sa->nalloc)) == NULL)
492             return ERROR_INT("new ptr array not returned", procName, 1);
493 
494     sa->nalloc *= 2;
495     return 0;
496 }
497 
498 
499 /*!
500  * \brief   sarrayRemoveString()
501  *
502  * \param[in]    sa string array
503  * \param[in]    index of string within sarray
504  * \return  removed string, or NULL on error
505  */
506 char *
sarrayRemoveString(SARRAY * sa,l_int32 index)507 sarrayRemoveString(SARRAY  *sa,
508                    l_int32  index)
509 {
510 char    *string;
511 char   **array;
512 l_int32  i, n, nalloc;
513 
514     PROCNAME("sarrayRemoveString");
515 
516     if (!sa)
517         return (char *)ERROR_PTR("sa not defined", procName, NULL);
518 
519     if ((array = sarrayGetArray(sa, &nalloc, &n)) == NULL)
520         return (char *)ERROR_PTR("array not returned", procName, NULL);
521 
522     if (index < 0 || index >= n)
523         return (char *)ERROR_PTR("array index out of bounds", procName, NULL);
524 
525     string = array[index];
526 
527         /* If removed string is not at end of array, shift
528          * to fill in, maintaining original ordering.
529          * Note: if we didn't care about the order, we could
530          * put the last string array[n - 1] directly into the hole.  */
531     for (i = index; i < n - 1; i++)
532         array[i] = array[i + 1];
533 
534     sa->n--;
535     return string;
536 }
537 
538 
539 /*!
540  * \brief   sarrayReplaceString()
541  *
542  * \param[in]    sa string array
543  * \param[in]    index of string within sarray to be replaced
544  * \param[in]    newstr string to replace existing one
545  * \param[in]    copyflag  L_INSERT, L_COPY
546  * \return  0 if OK, 1 on error
547  *
548  * <pre>
549  * Notes:
550  *      (1) This destroys an existing string and replaces it with
551  *          the new string or a copy of it.
552  *      (2) By design, an sarray is always compacted, so there are
553  *          never any holes (null ptrs) in the ptr array up to the
554  *          current count.
555  * </pre>
556  */
557 l_int32
sarrayReplaceString(SARRAY * sa,l_int32 index,char * newstr,l_int32 copyflag)558 sarrayReplaceString(SARRAY  *sa,
559                     l_int32  index,
560                     char    *newstr,
561                     l_int32  copyflag)
562 {
563 char    *str;
564 l_int32  n;
565 
566     PROCNAME("sarrayReplaceString");
567 
568     if (!sa)
569         return ERROR_INT("sa not defined", procName, 1);
570     n = sarrayGetCount(sa);
571     if (index < 0 || index >= n)
572         return ERROR_INT("array index out of bounds", procName, 1);
573     if (!newstr)
574         return ERROR_INT("newstr not defined", procName, 1);
575     if (copyflag != L_INSERT && copyflag != L_COPY)
576         return ERROR_INT("invalid copyflag", procName, 1);
577 
578     LEPT_FREE(sa->array[index]);
579     if (copyflag == L_INSERT)
580         str = newstr;
581     else  /* L_COPY */
582         str = stringNew(newstr);
583     sa->array[index] = str;
584     return 0;
585 }
586 
587 
588 /*!
589  * \brief   sarrayClear()
590  *
591  * \param[in]    sa string array
592  * \return  0 if OK; 1 on error
593  */
594 l_int32
sarrayClear(SARRAY * sa)595 sarrayClear(SARRAY  *sa)
596 {
597 l_int32  i;
598 
599     PROCNAME("sarrayClear");
600 
601     if (!sa)
602         return ERROR_INT("sa not defined", procName, 1);
603     for (i = 0; i < sa->n; i++) {  /* free strings and null ptrs */
604         LEPT_FREE(sa->array[i]);
605         sa->array[i] = NULL;
606     }
607     sa->n = 0;
608     return 0;
609 }
610 
611 
612 /*----------------------------------------------------------------------*
613  *                               Accessors                              *
614  *----------------------------------------------------------------------*/
615 /*!
616  * \brief   sarrayGetCount()
617  *
618  * \param[in]    sa string array
619  * \return  count, or 0 if no strings or on error
620  */
621 l_int32
sarrayGetCount(SARRAY * sa)622 sarrayGetCount(SARRAY  *sa)
623 {
624     PROCNAME("sarrayGetCount");
625 
626     if (!sa)
627         return ERROR_INT("sa not defined", procName, 0);
628     return sa->n;
629 }
630 
631 
632 /*!
633  * \brief   sarrayGetArray()
634  *
635  * \param[in]    sa string array
636  * \param[out]   pnalloc  [optional] number allocated string ptrs
637  * \param[out]   pn  [optional] number allocated strings
638  * \return  ptr to string array, or NULL on error
639  *
640  * <pre>
641  * Notes:
642  *      (1) Caution: the returned array is not a copy, so caller
643  *          must not destroy it!
644  * </pre>
645  */
646 char **
sarrayGetArray(SARRAY * sa,l_int32 * pnalloc,l_int32 * pn)647 sarrayGetArray(SARRAY   *sa,
648                l_int32  *pnalloc,
649                l_int32  *pn)
650 {
651 char  **array;
652 
653     PROCNAME("sarrayGetArray");
654 
655     if (!sa)
656         return (char **)ERROR_PTR("sa not defined", procName, NULL);
657 
658     array = sa->array;
659     if (pnalloc) *pnalloc = sa->nalloc;
660     if (pn) *pn = sa->n;
661 
662     return array;
663 }
664 
665 
666 /*!
667  * \brief   sarrayGetString()
668  *
669  * \param[in]    sa string array
670  * \param[in]    index   to the index-th string
671  * \param[in]    copyflag  L_NOCOPY or L_COPY
672  * \return  string, or NULL on error
673  *
674  * <pre>
675  * Notes:
676  *      (1) See usage comments at the top of this file.
677  *      (2) To get a pointer to the string itself, use L_NOCOPY.
678  *          To get a copy of the string, use L_COPY.
679  * </pre>
680  */
681 char *
sarrayGetString(SARRAY * sa,l_int32 index,l_int32 copyflag)682 sarrayGetString(SARRAY  *sa,
683                 l_int32  index,
684                 l_int32  copyflag)
685 {
686     PROCNAME("sarrayGetString");
687 
688     if (!sa)
689         return (char *)ERROR_PTR("sa not defined", procName, NULL);
690     if (index < 0 || index >= sa->n)
691         return (char *)ERROR_PTR("index not valid", procName, NULL);
692     if (copyflag != L_NOCOPY && copyflag != L_COPY)
693         return (char *)ERROR_PTR("invalid copyflag", procName, NULL);
694 
695     if (copyflag == L_NOCOPY)
696         return sa->array[index];
697     else  /* L_COPY */
698         return stringNew(sa->array[index]);
699 }
700 
701 
702 /*!
703  * \brief   sarrayGetRefCount()
704  *
705  * \param[in]    sa string array
706  * \return  refcount, or UNDEF on error
707  */
708 l_int32
sarrayGetRefcount(SARRAY * sa)709 sarrayGetRefcount(SARRAY  *sa)
710 {
711     PROCNAME("sarrayGetRefcount");
712 
713     if (!sa)
714         return ERROR_INT("sa not defined", procName, UNDEF);
715     return sa->refcount;
716 }
717 
718 
719 /*!
720  * \brief   sarrayChangeRefCount()
721  *
722  * \param[in]    sa string array
723  * \param[in]    delta change to be applied
724  * \return  0 if OK, 1 on error
725  */
726 l_int32
sarrayChangeRefcount(SARRAY * sa,l_int32 delta)727 sarrayChangeRefcount(SARRAY  *sa,
728                      l_int32  delta)
729 {
730     PROCNAME("sarrayChangeRefcount");
731 
732     if (!sa)
733         return ERROR_INT("sa not defined", procName, UNDEF);
734     sa->refcount += delta;
735     return 0;
736 }
737 
738 
739 /*----------------------------------------------------------------------*
740  *                      Conversion to string                           *
741  *----------------------------------------------------------------------*/
742 /*!
743  * \brief   sarrayToString()
744  *
745  * \param[in]    sa string array
746  * \param[in]    addnlflag flag: 0 adds nothing to each substring
747  *                               1 adds '\n' to each substring
748  *                               2 adds ' ' to each substring
749  * \return  dest string, or NULL on error
750  *
751  * <pre>
752  * Notes:
753  *      (1) Concatenates all the strings in the sarray, preserving
754  *          all white space.
755  *      (2) If addnlflag != 0, adds either a '\n' or a ' ' after
756  *          each substring.
757  *      (3) This function was NOT implemented as:
758  *            for (i = 0; i < n; i++)
759  *                     strcat(dest, sarrayGetString(sa, i, L_NOCOPY));
760  *          Do you see why?
761  * </pre>
762  */
763 char *
sarrayToString(SARRAY * sa,l_int32 addnlflag)764 sarrayToString(SARRAY  *sa,
765                l_int32  addnlflag)
766 {
767     PROCNAME("sarrayToString");
768 
769     if (!sa)
770         return (char *)ERROR_PTR("sa not defined", procName, NULL);
771 
772     return sarrayToStringRange(sa, 0, 0, addnlflag);
773 }
774 
775 
776 /*!
777  * \brief   sarrayToStringRange()
778  *
779  * \param[in]   sa string array
780  * \param[in]   first  index of first string to use; starts with 0
781  * \param[in]   nstrings number of strings to append into the result; use
782  *                       0 to append to the end of the sarray
783  * \param[in]   addnlflag flag: 0 adds nothing to each substring
784  *                              1 adds '\n' to each substring
785  *                              2 adds ' ' to each substring
786  * \return  dest string, or NULL on error
787  *
788  * <pre>
789  * Notes:
790  *      (1) Concatenates the specified strings inthe sarray, preserving
791  *          all white space.
792  *      (2) If addnlflag != 0, adds either a '\n' or a ' ' after
793  *          each substring.
794  *      (3) If the sarray is empty, this returns a string with just
795  *          the character corresponding to %addnlflag.
796  * </pre>
797  */
798 char *
sarrayToStringRange(SARRAY * sa,l_int32 first,l_int32 nstrings,l_int32 addnlflag)799 sarrayToStringRange(SARRAY  *sa,
800                     l_int32  first,
801                     l_int32  nstrings,
802                     l_int32  addnlflag)
803 {
804 char    *dest, *src, *str;
805 l_int32  n, i, last, size, index, len;
806 
807     PROCNAME("sarrayToStringRange");
808 
809     if (!sa)
810         return (char *)ERROR_PTR("sa not defined", procName, NULL);
811     if (addnlflag != 0 && addnlflag != 1 && addnlflag != 2)
812         return (char *)ERROR_PTR("invalid addnlflag", procName, NULL);
813 
814     n = sarrayGetCount(sa);
815 
816         /* Empty sa; return char corresponding to addnlflag only */
817     if (n == 0) {
818         if (first == 0) {
819             if (addnlflag == 0)
820                 return stringNew("");
821             if (addnlflag == 1)
822                 return stringNew("\n");
823             else  /* addnlflag == 2) */
824                 return stringNew(" ");
825         } else {
826             return (char *)ERROR_PTR("first not valid", procName, NULL);
827         }
828     }
829 
830     if (first < 0 || first >= n)
831         return (char *)ERROR_PTR("first not valid", procName, NULL);
832     if (nstrings == 0 || (nstrings > n - first))
833         nstrings = n - first;  /* no overflow */
834     last = first + nstrings - 1;
835 
836     size = 0;
837     for (i = first; i <= last; i++) {
838         if ((str = sarrayGetString(sa, i, L_NOCOPY)) == NULL)
839             return (char *)ERROR_PTR("str not found", procName, NULL);
840         size += strlen(str) + 2;
841     }
842 
843     if ((dest = (char *)LEPT_CALLOC(size + 1, sizeof(char))) == NULL)
844         return (char *)ERROR_PTR("dest not made", procName, NULL);
845 
846     index = 0;
847     for (i = first; i <= last; i++) {
848         src = sarrayGetString(sa, i, L_NOCOPY);
849         len = strlen(src);
850         memcpy(dest + index, src, len);
851         index += len;
852         if (addnlflag == 1) {
853             dest[index] = '\n';
854             index++;
855         } else if (addnlflag == 2) {
856             dest[index] = ' ';
857             index++;
858         }
859     }
860 
861     return dest;
862 }
863 
864 
865 /*----------------------------------------------------------------------*
866  *                           Join 2 sarrays                             *
867  *----------------------------------------------------------------------*/
868 /*!
869  * \brief   sarrayJoin()
870  *
871  * \param[in]    sa1  to be added to
872  * \param[in]    sa2  append to sa1
873  * \return  0 if OK, 1 on error
874  *
875  * <pre>
876  * Notes:
877  *      (1) Copies of the strings in sarray2 are added to sarray1.
878  * </pre>
879  */
880 l_int32
sarrayJoin(SARRAY * sa1,SARRAY * sa2)881 sarrayJoin(SARRAY  *sa1,
882            SARRAY  *sa2)
883 {
884 char    *str;
885 l_int32  n, i;
886 
887     PROCNAME("sarrayJoin");
888 
889     if (!sa1)
890         return ERROR_INT("sa1 not defined", procName, 1);
891     if (!sa2)
892         return ERROR_INT("sa2 not defined", procName, 1);
893 
894     n = sarrayGetCount(sa2);
895     for (i = 0; i < n; i++) {
896         str = sarrayGetString(sa2, i, L_NOCOPY);
897         sarrayAddString(sa1, str, L_COPY);
898     }
899 
900     return 0;
901 }
902 
903 
904 /*!
905  * \brief   sarrayAppendRange()
906  *
907  * \param[in]    sa1  to be added to
908  * \param[in]    sa2  append specified range of strings in sa2 to sa1
909  * \param[in]    start index of first string of sa2 to append
910  * \param[in]    end index of last string of sa2 to append; -1 to end of array
911  * \return  0 if OK, 1 on error
912  *
913  * <pre>
914  * Notes:
915  *      (1) Copies of the strings in sarray2 are added to sarray1.
916  *      (2) The [start ... end] range is truncated if necessary.
917  *      (3) Use end == -1 to append to the end of sa2.
918  * </pre>
919  */
920 l_int32
sarrayAppendRange(SARRAY * sa1,SARRAY * sa2,l_int32 start,l_int32 end)921 sarrayAppendRange(SARRAY  *sa1,
922                   SARRAY  *sa2,
923                   l_int32  start,
924                   l_int32  end)
925 {
926 char    *str;
927 l_int32  n, i;
928 
929     PROCNAME("sarrayAppendRange");
930 
931     if (!sa1)
932         return ERROR_INT("sa1 not defined", procName, 1);
933     if (!sa2)
934         return ERROR_INT("sa2 not defined", procName, 1);
935 
936     if (start < 0)
937         start = 0;
938     n = sarrayGetCount(sa2);
939     if (end < 0 || end >= n)
940         end = n - 1;
941     if (start > end)
942         return ERROR_INT("start > end", procName, 1);
943 
944     for (i = start; i <= end; i++) {
945         str = sarrayGetString(sa2, i, L_NOCOPY);
946         sarrayAddString(sa1, str, L_COPY);
947     }
948 
949     return 0;
950 }
951 
952 
953 /*----------------------------------------------------------------------*
954  *          Pad an sarray to be the same size as another sarray         *
955  *----------------------------------------------------------------------*/
956 /*!
957  * \brief   sarrayPadToSameSize()
958  *
959  * \param[in]    sa1, sa2
960  * \param[in]    padstring
961  * \return  0 if OK, 1 on error
962  *
963  * <pre>
964  * Notes:
965  *      (1) If two sarrays have different size, this adds enough
966  *          instances of %padstring to the smaller so that they are
967  *          the same size.  It is useful when two or more sarrays
968  *          are being sequenced in parallel, and it is necessary to
969  *          find a valid string at each index.
970  * </pre>
971  */
972 l_int32
sarrayPadToSameSize(SARRAY * sa1,SARRAY * sa2,char * padstring)973 sarrayPadToSameSize(SARRAY  *sa1,
974                     SARRAY  *sa2,
975                     char    *padstring)
976 {
977 l_int32  i, n1, n2;
978 
979     PROCNAME("sarrayPadToSameSize");
980 
981     if (!sa1 || !sa2)
982         return ERROR_INT("both sa1 and sa2 not defined", procName, 1);
983 
984     n1 = sarrayGetCount(sa1);
985     n2 = sarrayGetCount(sa2);
986     if (n1 < n2) {
987         for (i = n1; i < n2; i++)
988             sarrayAddString(sa1, padstring, L_COPY);
989     } else if (n1 > n2) {
990         for (i = n2; i < n1; i++)
991             sarrayAddString(sa2, padstring, L_COPY);
992     }
993 
994     return 0;
995 }
996 
997 
998 /*----------------------------------------------------------------------*
999  *                   Convert word sarray to line sarray                 *
1000  *----------------------------------------------------------------------*/
1001 /*!
1002  * \brief   sarrayConvertWordsToLines()
1003  *
1004  * \param[in]    sa  sa of individual words
1005  * \param[in]    linesize  max num of chars in each line
1006  * \return  saout sa of formatted lines, or NULL on error
1007  *
1008  *  This is useful for re-typesetting text to a specific maximum
1009  *  line length.  The individual words in the input sarray
1010  *  are concatenated into textlines.  An input word string of zero
1011  *  length is taken to be a paragraph separator.  Each time
1012  *  such a string is found, the current line is ended and
1013  *  a new line is also produced that contains just the
1014  *  string of zero length "".  When the output sarray
1015  *  of lines is eventually converted to a string with newlines
1016  *  typically appended to each line string, the empty
1017  *  strings are just converted to newlines, producing the visible
1018  *  paragraph separation.
1019  *
1020  *  What happens when a word is larger than linesize?
1021  *  We write it out as a single line anyway!  Words preceding
1022  *  or following this long word are placed on lines preceding
1023  *  or following the line with the long word.  Why this choice?
1024  *  Long "words" found in text documents are typically URLs, and
1025  *  it's often desirable not to put newlines in the middle of a URL.
1026  *  The text display program e.g., text editor will typically
1027  *  wrap the long "word" to fit in the window.
1028  */
1029 SARRAY *
sarrayConvertWordsToLines(SARRAY * sa,l_int32 linesize)1030 sarrayConvertWordsToLines(SARRAY  *sa,
1031                           l_int32  linesize)
1032 {
1033 char    *wd, *strl;
1034 char     emptystring[] = "";
1035 l_int32  n, i, len, totlen;
1036 SARRAY  *sal, *saout;
1037 
1038     PROCNAME("sarrayConvertWordsToLines");
1039 
1040     if (!sa)
1041         return (SARRAY *)ERROR_PTR("sa not defined", procName, NULL);
1042 
1043     saout = sarrayCreate(0);
1044     n = sarrayGetCount(sa);
1045     totlen = 0;
1046     sal = NULL;
1047     for (i = 0; i < n; i++) {
1048         if (!sal)
1049             sal = sarrayCreate(0);
1050         wd = sarrayGetString(sa, i, L_NOCOPY);
1051         len = strlen(wd);
1052         if (len == 0) {  /* end of paragraph: end line & insert blank line */
1053             if (totlen > 0) {
1054                 strl = sarrayToString(sal, 2);
1055                 sarrayAddString(saout, strl, L_INSERT);
1056             }
1057             sarrayAddString(saout, emptystring, L_COPY);
1058             sarrayDestroy(&sal);
1059             totlen = 0;
1060         } else if (totlen == 0 && len + 1 > linesize) {  /* long word! */
1061             sarrayAddString(saout, wd, L_COPY);  /* copy to one line */
1062         } else if (totlen + len + 1 > linesize) {  /* end line & start new */
1063             strl = sarrayToString(sal, 2);
1064             sarrayAddString(saout, strl, L_INSERT);
1065             sarrayDestroy(&sal);
1066             sal = sarrayCreate(0);
1067             sarrayAddString(sal, wd, L_COPY);
1068             totlen = len + 1;
1069         } else {  /* add to current line */
1070             sarrayAddString(sal, wd, L_COPY);
1071             totlen += len + 1;
1072         }
1073     }
1074     if (totlen > 0) {   /* didn't end with blank line; output last line */
1075         strl = sarrayToString(sal, 2);
1076         sarrayAddString(saout, strl, L_INSERT);
1077         sarrayDestroy(&sal);
1078     }
1079 
1080     return saout;
1081 }
1082 
1083 
1084 /*----------------------------------------------------------------------*
1085  *                    Split string on separator list                    *
1086  *----------------------------------------------------------------------*/
1087 /*
1088  *  sarraySplitString()
1089  *
1090  *      Input:  sa (to append to; typically empty initially)
1091  *              str (string to split; not changed)
1092  *              separators (characters that split input string)
1093  *      Return: 0 if OK, 1 on error.
1094  *
1095  *  Notes:
1096  *      (1) This uses strtokSafe().  See the notes there in utils.c.
1097  */
1098 l_int32
sarraySplitString(SARRAY * sa,const char * str,const char * separators)1099 sarraySplitString(SARRAY      *sa,
1100                   const char  *str,
1101                   const char  *separators)
1102 {
1103 char  *cstr, *substr, *saveptr;
1104 
1105     PROCNAME("sarraySplitString");
1106 
1107     if (!sa)
1108         return ERROR_INT("sa not defined", procName, 1);
1109     if (!str)
1110         return ERROR_INT("str not defined", procName, 1);
1111     if (!separators)
1112         return ERROR_INT("separators not defined", procName, 1);
1113 
1114     cstr = stringNew(str);  /* preserves const-ness of input str */
1115     substr = strtokSafe(cstr, separators, &saveptr);
1116     if (substr)
1117         sarrayAddString(sa, substr, L_INSERT);
1118     while ((substr = strtokSafe(NULL, separators, &saveptr)))
1119         sarrayAddString(sa, substr, L_INSERT);
1120     LEPT_FREE(cstr);
1121 
1122     return 0;
1123 }
1124 
1125 
1126 /*----------------------------------------------------------------------*
1127  *                              Filter sarray                           *
1128  *----------------------------------------------------------------------*/
1129 /*!
1130  * \brief   sarraySelectBySubstring()
1131  *
1132  * \param[in]    sain input sarray
1133  * \param[in]    substr [optional] substring for matching; can be NULL
1134  * \return  saout output sarray, filtered with substring or NULL on error
1135  *
1136  * <pre>
1137  * Notes:
1138  *      (1) This selects all strings in sain that have substr as a substring.
1139  *          Note that we can't use strncmp() because we're looking for
1140  *          a match to the substring anywhere within each filename.
1141  *      (2) If substr == NULL, returns a copy of the sarray.
1142  * </pre>
1143  */
1144 SARRAY *
sarraySelectBySubstring(SARRAY * sain,const char * substr)1145 sarraySelectBySubstring(SARRAY      *sain,
1146                         const char  *substr)
1147 {
1148 char    *str;
1149 l_int32  n, i, offset, found;
1150 SARRAY  *saout;
1151 
1152     PROCNAME("sarraySelectBySubstring");
1153 
1154     if (!sain)
1155         return (SARRAY *)ERROR_PTR("sain not defined", procName, NULL);
1156 
1157     n = sarrayGetCount(sain);
1158     if (!substr || n == 0)
1159         return sarrayCopy(sain);
1160 
1161     saout = sarrayCreate(n);
1162     for (i = 0; i < n; i++) {
1163         str = sarrayGetString(sain, i, L_NOCOPY);
1164         arrayFindSequence((l_uint8 *)str, strlen(str), (l_uint8 *)substr,
1165                           strlen(substr), &offset, &found);
1166         if (found)
1167             sarrayAddString(saout, str, L_COPY);
1168     }
1169 
1170     return saout;
1171 }
1172 
1173 
1174 /*!
1175  * \brief   sarraySelectByRange()
1176  *
1177  * \param[in]    sain input sarray
1178  * \param[in]    first index of first string to be selected
1179  * \param[in]    last index of last string to be selected; use 0 to go to the
1180  *                    end of the sarray
1181  * \return  saout output sarray, or NULL on error
1182  *
1183  * <pre>
1184  * Notes:
1185  *      (1) This makes %saout consisting of copies of all strings in %sain
1186  *          in the index set [first ... last].  Use %last == 0 to get all
1187  *          strings from %first to the last string in the sarray.
1188  * </pre>
1189  */
1190 SARRAY *
sarraySelectByRange(SARRAY * sain,l_int32 first,l_int32 last)1191 sarraySelectByRange(SARRAY  *sain,
1192                     l_int32  first,
1193                     l_int32  last)
1194 {
1195 char    *str;
1196 l_int32  n, i;
1197 SARRAY  *saout;
1198 
1199     PROCNAME("sarraySelectByRange");
1200 
1201     if (!sain)
1202         return (SARRAY *)ERROR_PTR("sain not defined", procName, NULL);
1203     if (first < 0) first = 0;
1204     n = sarrayGetCount(sain);
1205     if (last <= 0) last = n - 1;
1206     if (last >= n) {
1207         L_WARNING("last > n - 1; setting to n - 1\n", procName);
1208         last = n - 1;
1209     }
1210     if (first > last)
1211         return (SARRAY *)ERROR_PTR("first must be >= last", procName, NULL);
1212 
1213     saout = sarrayCreate(0);
1214     for (i = first; i <= last; i++) {
1215         str = sarrayGetString(sain, i, L_COPY);
1216         sarrayAddString(saout, str, L_INSERT);
1217     }
1218 
1219     return saout;
1220 }
1221 
1222 
1223 /*!
1224  * \brief   sarrayParseRange()
1225  *
1226  * \param[in]    sa input sarray
1227  * \param[in]    start index to start range search
1228  * \param[out]  pactualstart index of actual start; may be > 'start'
1229  * \param[out]  pend index of end
1230  * \param[out]  pnewstart index of start of next range
1231  * \param[in]    substr substring for matching at beginning of string
1232  * \param[in]    loc byte offset within the string for the pattern; use
1233  *                   -1 if the location does not matter;
1234  * \return  0 if valid range found; 1 otherwise
1235  *
1236  * <pre>
1237  * Notes:
1238  *      (1) This finds the range of the next set of strings in SA,
1239  *          beginning the search at 'start', that does NOT have
1240  *          the substring 'substr' either at the indicated location
1241  *          in the string or anywhere in the string.  The input
1242  *          variable 'loc' is the specified offset within the string;
1243  *          use -1 to indicate 'anywhere in the string'.
1244  *      (2) Always check the return value to verify that a valid range
1245  *          was found.
1246  *      (3) If a valid range is not found, the values of actstart,
1247  *          end and newstart are all set to the size of sa.
1248  *      (4) If this is the last valid range, newstart returns the value n.
1249  *          In use, this should be tested before calling the function.
1250  *      (5) Usage example.  To find all the valid ranges in a file
1251  *          where the invalid lines begin with two dashes, copy each
1252  *          line in the file to a string in an sarray, and do:
1253  *             start = 0;
1254  *             while (!sarrayParseRange(sa, start, &actstart, &end, &start,
1255  *                    "--", 0))
1256  *                 fprintf(stderr, "start = %d, end = %d\n", actstart, end);
1257  * </pre>
1258  */
1259 l_int32
sarrayParseRange(SARRAY * sa,l_int32 start,l_int32 * pactualstart,l_int32 * pend,l_int32 * pnewstart,const char * substr,l_int32 loc)1260 sarrayParseRange(SARRAY      *sa,
1261                  l_int32      start,
1262                  l_int32     *pactualstart,
1263                  l_int32     *pend,
1264                  l_int32     *pnewstart,
1265                  const char  *substr,
1266                  l_int32      loc)
1267 {
1268 char    *str;
1269 l_int32  n, i, offset, found;
1270 
1271     PROCNAME("sarrayParseRange");
1272 
1273     if (!sa)
1274         return ERROR_INT("sa not defined", procName, 1);
1275     if (!pactualstart || !pend || !pnewstart)
1276         return ERROR_INT("not all range addresses defined", procName, 1);
1277     n = sarrayGetCount(sa);
1278     *pactualstart = *pend = *pnewstart = n;
1279     if (!substr)
1280         return ERROR_INT("substr not defined", procName, 1);
1281 
1282         /* Look for the first string without the marker */
1283     if (start < 0 || start >= n)
1284         return 1;
1285     for (i = start; i < n; i++) {
1286         str = sarrayGetString(sa, i, L_NOCOPY);
1287         arrayFindSequence((l_uint8 *)str, strlen(str), (l_uint8 *)substr,
1288                           strlen(substr), &offset, &found);
1289         if (loc < 0) {
1290             if (!found) break;
1291         } else {
1292             if (!found || offset != loc) break;
1293         }
1294     }
1295     start = i;
1296     if (i == n)  /* couldn't get started */
1297         return 1;
1298 
1299         /* Look for the last string without the marker */
1300     *pactualstart = start;
1301     for (i = start + 1; i < n; i++) {
1302         str = sarrayGetString(sa, i, L_NOCOPY);
1303         arrayFindSequence((l_uint8 *)str, strlen(str), (l_uint8 *)substr,
1304                           strlen(substr), &offset, &found);
1305         if (loc < 0) {
1306             if (found) break;
1307         } else {
1308             if (found && offset == loc) break;
1309         }
1310     }
1311     *pend = i - 1;
1312     start = i;
1313     if (i == n)  /* no further range */
1314         return 0;
1315 
1316         /* Look for the first string after *pend without the marker.
1317          * This will start the next run of strings, if it exists. */
1318     for (i = start; i < n; i++) {
1319         str = sarrayGetString(sa, i, L_NOCOPY);
1320         arrayFindSequence((l_uint8 *)str, strlen(str), (l_uint8 *)substr,
1321                           strlen(substr), &offset, &found);
1322         if (loc < 0) {
1323             if (!found) break;
1324         } else {
1325             if (!found || offset != loc) break;
1326         }
1327     }
1328     if (i < n)
1329         *pnewstart = i;
1330 
1331     return 0;
1332 }
1333 
1334 
1335 /*----------------------------------------------------------------------*
1336  *                           Serialize for I/O                          *
1337  *----------------------------------------------------------------------*/
1338 /*!
1339  * \brief   sarrayRead()
1340  *
1341  * \param[in]    filename
1342  * \return  sarray, or NULL on error
1343  */
1344 SARRAY *
sarrayRead(const char * filename)1345 sarrayRead(const char  *filename)
1346 {
1347 FILE    *fp;
1348 SARRAY  *sa;
1349 
1350     PROCNAME("sarrayRead");
1351 
1352     if (!filename)
1353         return (SARRAY *)ERROR_PTR("filename not defined", procName, NULL);
1354 
1355     if ((fp = fopenReadStream(filename)) == NULL)
1356         return (SARRAY *)ERROR_PTR("stream not opened", procName, NULL);
1357     sa = sarrayReadStream(fp);
1358     fclose(fp);
1359     if (!sa)
1360         return (SARRAY *)ERROR_PTR("sa not read", procName, NULL);
1361     return sa;
1362 }
1363 
1364 
1365 /*!
1366  * \brief   sarrayReadStream()
1367  *
1368  * \param[in]    fp file stream
1369  * \return  sarray, or NULL on error
1370  *
1371  * <pre>
1372  * Notes:
1373  *      (1) We store the size of each string along with the string.
1374  *          The limit on the number of strings is 2^24.
1375  *          The limit on the size of any string is 2^30 bytes.
1376  *      (2) This allows a string to have embedded newlines.  By reading
1377  *          the entire string, as determined by its size, we are
1378  *          not affected by any number of embedded newlines.
1379  * </pre>
1380  */
1381 SARRAY *
sarrayReadStream(FILE * fp)1382 sarrayReadStream(FILE  *fp)
1383 {
1384 char    *stringbuf;
1385 l_int32  i, n, size, index, bufsize, version, ignore, success;
1386 SARRAY  *sa;
1387 
1388     PROCNAME("sarrayReadStream");
1389 
1390     if (!fp)
1391         return (SARRAY *)ERROR_PTR("stream not defined", procName, NULL);
1392 
1393     if (fscanf(fp, "\nSarray Version %d\n", &version) != 1)
1394         return (SARRAY *)ERROR_PTR("not an sarray file", procName, NULL);
1395     if (version != SARRAY_VERSION_NUMBER)
1396         return (SARRAY *)ERROR_PTR("invalid sarray version", procName, NULL);
1397     if (fscanf(fp, "Number of strings = %d\n", &n) != 1)
1398         return (SARRAY *)ERROR_PTR("error on # strings", procName, NULL);
1399     if (n > (1 << 24))
1400         return (SARRAY *)ERROR_PTR("more than 2^24 strings!", procName, NULL);
1401 
1402     success = TRUE;
1403     if ((sa = sarrayCreate(n)) == NULL)
1404         return (SARRAY *)ERROR_PTR("sa not made", procName, NULL);
1405     bufsize = L_BUF_SIZE + 1;
1406     stringbuf = (char *)LEPT_CALLOC(bufsize, sizeof(char));
1407 
1408     for (i = 0; i < n; i++) {
1409             /* Get the size of the stored string */
1410         if ((fscanf(fp, "%d[%d]:", &index, &size) != 2) || (size > (1 << 30))) {
1411             success = FALSE;
1412             L_ERROR("error on string size\n", procName);
1413             goto cleanup;
1414         }
1415             /* Expand the string buffer if necessary */
1416         if (size > bufsize - 5) {
1417             LEPT_FREE(stringbuf);
1418             bufsize = (l_int32)(1.5 * size);
1419             stringbuf = (char *)LEPT_CALLOC(bufsize, sizeof(char));
1420         }
1421             /* Read the stored string, plus leading spaces and trailing \n */
1422         if (fread(stringbuf, 1, size + 3, fp) != size + 3) {
1423             success = FALSE;
1424             L_ERROR("error reading string\n", procName);
1425             goto cleanup;
1426         }
1427             /* Remove the \n that was added by sarrayWriteStream() */
1428         stringbuf[size + 2] = '\0';
1429             /* Copy it in, skipping the 2 leading spaces */
1430         sarrayAddString(sa, stringbuf + 2, L_COPY);
1431     }
1432     ignore = fscanf(fp, "\n");
1433 
1434 cleanup:
1435     LEPT_FREE(stringbuf);
1436     if (!success) sarrayDestroy(&sa);
1437     return sa;
1438 }
1439 
1440 
1441 /*!
1442  * \brief   sarrayReadMem()
1443  *
1444  * \param[in]    data  serialization in ascii
1445  * \param[in]    size  of data; can use strlen to get it
1446  * \return  sarray, or NULL on error
1447  */
1448 SARRAY *
sarrayReadMem(const l_uint8 * data,size_t size)1449 sarrayReadMem(const l_uint8  *data,
1450               size_t          size)
1451 {
1452 FILE    *fp;
1453 SARRAY  *sa;
1454 
1455     PROCNAME("sarrayReadMem");
1456 
1457     if (!data)
1458         return (SARRAY *)ERROR_PTR("data not defined", procName, NULL);
1459     if ((fp = fopenReadFromMemory(data, size)) == NULL)
1460         return (SARRAY *)ERROR_PTR("stream not opened", procName, NULL);
1461 
1462     sa = sarrayReadStream(fp);
1463     fclose(fp);
1464     if (!sa) L_ERROR("sarray not read\n", procName);
1465     return sa;
1466 }
1467 
1468 
1469 /*!
1470  * \brief   sarrayWrite()
1471  *
1472  * \param[in]    filename
1473  * \param[in]    sa string array
1474  * \return  0 if OK; 1 on error
1475  */
1476 l_int32
sarrayWrite(const char * filename,SARRAY * sa)1477 sarrayWrite(const char  *filename,
1478             SARRAY      *sa)
1479 {
1480 l_int32  ret;
1481 FILE    *fp;
1482 
1483     PROCNAME("sarrayWrite");
1484 
1485     if (!filename)
1486         return ERROR_INT("filename not defined", procName, 1);
1487     if (!sa)
1488         return ERROR_INT("sa not defined", procName, 1);
1489 
1490     if ((fp = fopenWriteStream(filename, "w")) == NULL)
1491         return ERROR_INT("stream not opened", procName, 1);
1492     ret = sarrayWriteStream(fp, sa);
1493     fclose(fp);
1494     if (ret)
1495         return ERROR_INT("sa not written to stream", procName, 1);
1496     return 0;
1497 }
1498 
1499 
1500 /*!
1501  * \brief   sarrayWriteStream()
1502  *
1503  * \param[in]    fp file stream
1504  * \param[in]    sa string array
1505  * \return  0 if OK; 1 on error
1506  *
1507  * <pre>
1508  * Notes:
1509  *      (1) This appends a '\n' to each string, which is stripped
1510  *          off by sarrayReadStream().
1511  * </pre>
1512  */
1513 l_int32
sarrayWriteStream(FILE * fp,SARRAY * sa)1514 sarrayWriteStream(FILE    *fp,
1515                   SARRAY  *sa)
1516 {
1517 l_int32  i, n, len;
1518 
1519     PROCNAME("sarrayWriteStream");
1520 
1521     if (!fp)
1522         return ERROR_INT("stream not defined", procName, 1);
1523     if (!sa)
1524         return ERROR_INT("sa not defined", procName, 1);
1525 
1526     n = sarrayGetCount(sa);
1527     fprintf(fp, "\nSarray Version %d\n", SARRAY_VERSION_NUMBER);
1528     fprintf(fp, "Number of strings = %d\n", n);
1529     for (i = 0; i < n; i++) {
1530         len = strlen(sa->array[i]);
1531         fprintf(fp, "  %d[%d]:  %s\n", i, len, sa->array[i]);
1532     }
1533     fprintf(fp, "\n");
1534 
1535     return 0;
1536 }
1537 
1538 
1539 /*!
1540  * \brief   sarrayWriteMem()
1541  *
1542  * \param[out]   pdata data of serialized sarray; ascii
1543  * \param[out]   psize size of returned data
1544  * \param[in]    sa
1545  * \return  0 if OK, 1 on error
1546  *
1547  * <pre>
1548  * Notes:
1549  *      (1) Serializes a sarray in memory and puts the result in a buffer.
1550  * </pre>
1551  */
1552 l_int32
sarrayWriteMem(l_uint8 ** pdata,size_t * psize,SARRAY * sa)1553 sarrayWriteMem(l_uint8  **pdata,
1554                size_t    *psize,
1555                SARRAY    *sa)
1556 {
1557 l_int32  ret;
1558 FILE    *fp;
1559 
1560     PROCNAME("sarrayWriteMem");
1561 
1562     if (pdata) *pdata = NULL;
1563     if (psize) *psize = 0;
1564     if (!pdata)
1565         return ERROR_INT("&data not defined", procName, 1);
1566     if (!psize)
1567         return ERROR_INT("&size not defined", procName, 1);
1568     if (!sa)
1569         return ERROR_INT("sa not defined", procName, 1);
1570 
1571 #if HAVE_FMEMOPEN
1572     if ((fp = open_memstream((char **)pdata, psize)) == NULL)
1573         return ERROR_INT("stream not opened", procName, 1);
1574     ret = sarrayWriteStream(fp, sa);
1575 #else
1576     L_INFO("work-around: writing to a temp file\n", procName);
1577   #ifdef _WIN32
1578     if ((fp = fopenWriteWinTempfile()) == NULL)
1579         return ERROR_INT("tmpfile stream not opened", procName, 1);
1580   #else
1581     if ((fp = tmpfile()) == NULL)
1582         return ERROR_INT("tmpfile stream not opened", procName, 1);
1583   #endif  /* _WIN32 */
1584     ret = sarrayWriteStream(fp, sa);
1585     rewind(fp);
1586     *pdata = l_binaryReadStream(fp, psize);
1587 #endif  /* HAVE_FMEMOPEN */
1588     fclose(fp);
1589     return ret;
1590 }
1591 
1592 
1593 /*!
1594  * \brief   sarrayAppend()
1595  *
1596  * \param[in]    filename
1597  * \param[in]    sa
1598  * \return  0 if OK; 1 on error
1599  */
1600 l_int32
sarrayAppend(const char * filename,SARRAY * sa)1601 sarrayAppend(const char  *filename,
1602              SARRAY      *sa)
1603 {
1604 FILE  *fp;
1605 
1606     PROCNAME("sarrayAppend");
1607 
1608     if (!filename)
1609         return ERROR_INT("filename not defined", procName, 1);
1610     if (!sa)
1611         return ERROR_INT("sa not defined", procName, 1);
1612 
1613     if ((fp = fopenWriteStream(filename, "a")) == NULL)
1614         return ERROR_INT("stream not opened", procName, 1);
1615     if (sarrayWriteStream(fp, sa)) {
1616         fclose(fp);
1617         return ERROR_INT("sa not appended to stream", procName, 1);
1618     }
1619 
1620     fclose(fp);
1621     return 0;
1622 }
1623 
1624 
1625 /*---------------------------------------------------------------------*
1626  *                           Directory filenames                       *
1627  *---------------------------------------------------------------------*/
1628 /*!
1629  * \brief   getNumberedPathnamesInDirectory()
1630  *
1631  * \param[in]    dirname directory name
1632  * \param[in]    substr [optional] substring filter on filenames; can be NULL
1633  * \param[in]    numpre number of characters in name before number
1634  * \param[in]    numpost number of characters in name after the number,
1635  *                       up to a dot before an extension
1636  * \param[in]    maxnum only consider page numbers up to this value
1637  * \return  sarray of numbered pathnames, or NULL on error
1638  *
1639  * <pre>
1640  * Notes:
1641  *      (1) Returns the full pathnames of the numbered filenames in
1642  *          the directory.  The number in the filename is the index
1643  *          into the sarray.  For indices for which there are no filenames,
1644  *          an empty string ("") is placed into the sarray.
1645  *          This makes reading numbered files very simple.  For example,
1646  *          the image whose filename includes number N can be retrieved using
1647  *               pixReadIndexed(sa, N);
1648  *      (2) If %substr is not NULL, only filenames that contain
1649  *          the substring can be included.  If %substr is NULL,
1650  *          all matching filenames are used.
1651  *      (3) If no numbered files are found, it returns an empty sarray,
1652  *          with no initialized strings.
1653  *      (4) It is assumed that the page number is contained within
1654  *          the basename (the filename without directory or extension).
1655  *          %numpre is the number of characters in the basename
1656  *          preceding the actual page number; %numpost is the number
1657  *          following the page number, up to either the end of the
1658  *          basename or a ".", whichever comes first.
1659  *      (5) This is useful when all filenames contain numbers that are
1660  *          not necessarily consecutive.  0-padding is not required.
1661  *      (6) To use a O(n) matching algorithm, the largest page number
1662  *          is found and two internal arrays of this size are created.
1663  *          This maximum is constrained not to exceed %maxsum,
1664  *          to make sure that an unrealistically large number is not
1665  *          accidentally used to determine the array sizes.
1666  * </pre>
1667  */
1668 SARRAY *
getNumberedPathnamesInDirectory(const char * dirname,const char * substr,l_int32 numpre,l_int32 numpost,l_int32 maxnum)1669 getNumberedPathnamesInDirectory(const char  *dirname,
1670                                 const char  *substr,
1671                                 l_int32      numpre,
1672                                 l_int32      numpost,
1673                                 l_int32      maxnum)
1674 {
1675 l_int32  nfiles;
1676 SARRAY  *sa, *saout;
1677 
1678     PROCNAME("getNumberedPathnamesInDirectory");
1679 
1680     if (!dirname)
1681         return (SARRAY *)ERROR_PTR("dirname not defined", procName, NULL);
1682 
1683     if ((sa = getSortedPathnamesInDirectory(dirname, substr, 0, 0)) == NULL)
1684         return (SARRAY *)ERROR_PTR("sa not made", procName, NULL);
1685     if ((nfiles = sarrayGetCount(sa)) == 0) {
1686         sarrayDestroy(&sa);
1687         return sarrayCreate(1);
1688     }
1689 
1690     saout = convertSortedToNumberedPathnames(sa, numpre, numpost, maxnum);
1691     sarrayDestroy(&sa);
1692     return saout;
1693 }
1694 
1695 
1696 /*!
1697  * \brief   getSortedPathnamesInDirectory()
1698  *
1699  * \param[in]    dirname directory name
1700  * \param[in]    substr [optional] substring filter on filenames; can be NULL
1701  * \param[in]    first 0-based
1702  * \param[in]    nfiles use 0 for all to the end
1703  * \return  sarray of sorted pathnames, or NULL on error
1704  *
1705  * <pre>
1706  * Notes:
1707  *      (1) Use %substr to filter filenames in the directory.  If
1708  *          %substr == NULL, this takes all files.
1709  *      (2) The files in the directory, after optional filtering by
1710  *          the substring, are lexically sorted in increasing order.
1711  *          Use %first and %nfiles to select a contiguous set of files.
1712  *      (3) The full pathnames are returned for the requested sequence.
1713  *          If no files are found after filtering, returns an empty sarray.
1714  * </pre>
1715  */
1716 SARRAY *
getSortedPathnamesInDirectory(const char * dirname,const char * substr,l_int32 first,l_int32 nfiles)1717 getSortedPathnamesInDirectory(const char  *dirname,
1718                               const char  *substr,
1719                               l_int32      first,
1720                               l_int32      nfiles)
1721 {
1722 char    *fname, *fullname;
1723 l_int32  i, n, last;
1724 SARRAY  *sa, *safiles, *saout;
1725 
1726     PROCNAME("getSortedPathnamesInDirectory");
1727 
1728     if (!dirname)
1729         return (SARRAY *)ERROR_PTR("dirname not defined", procName, NULL);
1730 
1731     if ((sa = getFilenamesInDirectory(dirname)) == NULL)
1732         return (SARRAY *)ERROR_PTR("sa not made", procName, NULL);
1733     safiles = sarraySelectBySubstring(sa, substr);
1734     sarrayDestroy(&sa);
1735     n = sarrayGetCount(safiles);
1736     if (n == 0) {
1737         L_WARNING("no files found\n", procName);
1738         return safiles;
1739     }
1740 
1741     sarraySort(safiles, safiles, L_SORT_INCREASING);
1742 
1743     first = L_MIN(L_MAX(first, 0), n - 1);
1744     if (nfiles == 0)
1745         nfiles = n - first;
1746     last = L_MIN(first + nfiles - 1, n - 1);
1747 
1748     saout = sarrayCreate(last - first + 1);
1749     for (i = first; i <= last; i++) {
1750         fname = sarrayGetString(safiles, i, L_NOCOPY);
1751         fullname = pathJoin(dirname, fname);
1752         sarrayAddString(saout, fullname, L_INSERT);
1753     }
1754 
1755     sarrayDestroy(&safiles);
1756     return saout;
1757 }
1758 
1759 
1760 /*!
1761  * \brief   convertSortedToNumberedPathnames()
1762  *
1763  * \param[in]    sa sorted pathnames including zero-padded integers
1764  * \param[in]    numpre number of characters in name before number
1765  * \param[in]    numpost number of characters in name after the number,
1766  *                       up to a dot before an extension
1767  * \param[in]    maxnum only consider page numbers up to this value
1768  * \return  sarray of numbered pathnames, or NULL on error
1769  *
1770  * <pre>
1771  * Notes:
1772  *      (1) Typically, numpre = numpost = 0; e.g., when the filename
1773  *          just has a number followed by an optional extension.
1774  * </pre>
1775  */
1776 SARRAY *
convertSortedToNumberedPathnames(SARRAY * sa,l_int32 numpre,l_int32 numpost,l_int32 maxnum)1777 convertSortedToNumberedPathnames(SARRAY   *sa,
1778                                  l_int32   numpre,
1779                                  l_int32   numpost,
1780                                  l_int32   maxnum)
1781 {
1782 char    *fname, *str;
1783 l_int32  i, nfiles, num, index;
1784 SARRAY  *saout;
1785 
1786     PROCNAME("convertSortedToNumberedPathnames");
1787 
1788     if (!sa)
1789         return (SARRAY *)ERROR_PTR("sa not defined", procName, NULL);
1790     if ((nfiles = sarrayGetCount(sa)) == 0)
1791         return sarrayCreate(1);
1792 
1793         /* Find the last file in the sorted array that has a number
1794          * that (a) matches the count pattern and (b) does not
1795          * exceed %maxnum.  %maxnum sets an upper limit on the size
1796          * of the sarray.  */
1797     num = 0;
1798     for (i = nfiles - 1; i >= 0; i--) {
1799       fname = sarrayGetString(sa, i, L_NOCOPY);
1800       num = extractNumberFromFilename(fname, numpre, numpost);
1801       if (num < 0) continue;
1802       num = L_MIN(num + 1, maxnum);
1803       break;
1804     }
1805 
1806     if (num <= 0)  /* none found */
1807         return sarrayCreate(1);
1808 
1809         /* Insert pathnames into the output sarray.
1810          * Ignore numbers that are out of the range of sarray. */
1811     saout = sarrayCreateInitialized(num, (char *)"");
1812     for (i = 0; i < nfiles; i++) {
1813       fname = sarrayGetString(sa, i, L_NOCOPY);
1814       index = extractNumberFromFilename(fname, numpre, numpost);
1815       if (index < 0 || index >= num) continue;
1816       str = sarrayGetString(saout, index, L_NOCOPY);
1817       if (str[0] != '\0')
1818           L_WARNING("\n  Multiple files with same number: %d\n",
1819                     procName, index);
1820       sarrayReplaceString(saout, index, fname, L_COPY);
1821     }
1822 
1823     return saout;
1824 }
1825 
1826 
1827 /*!
1828  * \brief   getFilenamesInDirectory()
1829  *
1830  * \param[in]    dirname directory name
1831  * \return  sarray of file names, or NULL on error
1832  *
1833  * <pre>
1834  * Notes:
1835  *      (1) The versions compiled under unix and cygwin use the POSIX C
1836  *          library commands for handling directories.  For windows,
1837  *          there is a separate implementation.
1838  *      (2) It returns an array of filename tails; i.e., only the part of
1839  *          the path after the last slash.
1840  *      (3) Use of the d_type field of dirent is not portable:
1841  *          "According to POSIX, the dirent structure contains a field
1842  *          char d_name[] of unspecified size, with at most NAME_MAX
1843  *          characters preceding the terminating null character.  Use
1844  *          of other fields will harm the portability of your programs."
1845  *      (4) As a consequence of (3), we note several things:
1846  *           ~ MINGW doesn't have a d_type member.
1847  *           ~ Older versions of gcc (e.g., 2.95.3) return DT_UNKNOWN
1848  *             for d_type from all files.
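 *          For that reason the unix implementation does not read d_type.
 *          It calls fstatat() (or stat() when HAVE_FSTATAT is not set)
 *          on each entry and omits the entries reported as directories,
 *          which includes '.' and '..'.  An entry for which that call
 *          fails is kept.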
1852  * </pre>
1853  */
1854 
1855 #ifndef _WIN32
1856 
1857 SARRAY *
getFilenamesInDirectory(const char * dirname)1858 getFilenamesInDirectory(const char  *dirname)
1859 {
1860 char            dir[PATH_MAX + 1];
1861 char           *realdir, *stat_path, *ignore;
1862 size_t          size;
1863 SARRAY         *safiles;
1864 DIR            *pdir;
1865 struct dirent  *pdirentry;
1866 int             dfd, stat_ret;
1867 struct stat     st;
1868 
1869     PROCNAME("getFilenamesInDirectory");
1870 
1871     if (!dirname)
1872         return (SARRAY *)ERROR_PTR("dirname not defined", procName, NULL);
1873 
1874         /* It's nice to ignore directories.  fstatat() works with relative
1875            directory paths, but stat() requires using the absolute path.
1876            Also, do not pass NULL as the second parameter to realpath();
1877            use a buffer of sufficient size. */
1878     ignore = realpath(dirname, dir);  /* see note above */
1879     realdir = genPathname(dir, NULL);
1880     if ((pdir = opendir(realdir)) == NULL) {
1881         LEPT_FREE(realdir);
1882         return (SARRAY *)ERROR_PTR("pdir not opened", procName, NULL);
1883     }
1884     safiles = sarrayCreate(0);
1885     dfd = dirfd(pdir);
1886     while ((pdirentry = readdir(pdir))) {
1887 #if HAVE_FSTATAT
1888         stat_ret = fstatat(dfd, pdirentry->d_name, &st, 0);
1889 #else
1890         size = strlen(realdir) + strlen(pdirentry->d_name) + 2;
1891         if (size > PATH_MAX) {
1892             L_ERROR("size = %lu too large; skipping\n", procName,
1893                     (unsigned long)size);
1894             continue;
1895         }
1896         stat_path = (char *)LEPT_CALLOC(size, 1);
1897         snprintf(stat_path, size, "%s/%s", realdir, pdirentry->d_name);
1898         stat_ret = stat(stat_path, &st);
1899         LEPT_FREE(stat_path);
1900 #endif
1901         if (stat_ret == 0 && S_ISDIR(st.st_mode))
1902             continue;
1903         sarrayAddString(safiles, pdirentry->d_name, L_COPY);
1904     }
1905     closedir(pdir);
1906     LEPT_FREE(realdir);
1907     return safiles;
1908 }
1909 
1910 #else  /* _WIN32 */
1911 
1912     /* http://msdn2.microsoft.com/en-us/library/aa365200(VS.85).aspx */
1913 #include <windows.h>
1914 
1915 SARRAY *
getFilenamesInDirectory(const char * dirname)1916 getFilenamesInDirectory(const char  *dirname)
1917 {
1918 char             *pszDir;
1919 char             *realdir;
1920 HANDLE            hFind = INVALID_HANDLE_VALUE;
1921 SARRAY           *safiles;
1922 WIN32_FIND_DATAA  ffd;
1923 
1924     PROCNAME("getFilenamesInDirectory");
1925 
1926     if (!dirname)
1927         return (SARRAY *)ERROR_PTR("dirname not defined", procName, NULL);
1928 
1929     realdir = genPathname(dirname, NULL);
1930     pszDir = stringJoin(realdir, "\\*");
1931     LEPT_FREE(realdir);
1932 
1933     if (strlen(pszDir) + 1 > MAX_PATH) {
1934         LEPT_FREE(pszDir);
1935         return (SARRAY *)ERROR_PTR("dirname is too long", procName, NULL);
1936     }
1937 
1938     if ((safiles = sarrayCreate(0)) == NULL) {
1939         LEPT_FREE(pszDir);
1940         return (SARRAY *)ERROR_PTR("safiles not made", procName, NULL);
1941     }
1942 
1943     hFind = FindFirstFileA(pszDir, &ffd);
1944     if (INVALID_HANDLE_VALUE == hFind) {
1945         sarrayDestroy(&safiles);
1946         LEPT_FREE(pszDir);
1947         return (SARRAY *)ERROR_PTR("hFind not opened", procName, NULL);
1948     }
1949 
1950     while (FindNextFileA(hFind, &ffd) != 0) {
1951         if (ffd.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY)  /* skip dirs */
1952             continue;
1953         convertSepCharsInPath(ffd.cFileName, UNIX_PATH_SEPCHAR);
1954         sarrayAddString(safiles, ffd.cFileName, L_COPY);
1955     }
1956 
1957     FindClose(hFind);
1958     LEPT_FREE(pszDir);
1959     return safiles;
1960 }
1961 #endif  /* _WIN32 */
1962