1 /*====================================================================*
2 - Copyright (C) 2001 Leptonica. All rights reserved.
3 -
4 - Redistribution and use in source and binary forms, with or without
5 - modification, are permitted provided that the following conditions
6 - are met:
7 - 1. Redistributions of source code must retain the above copyright
8 - notice, this list of conditions and the following disclaimer.
9 - 2. Redistributions in binary form must reproduce the above
10 - copyright notice, this list of conditions and the following
11 - disclaimer in the documentation and/or other materials
12 - provided with the distribution.
13 -
14 - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
15 - ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
16 - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
17 - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ANY
18 - CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 - PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 - OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
23 - NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 *====================================================================*/
26
27 /*!
28 * \file sarray1.c
29 * <pre>
30 *
31 * Create/Destroy/Copy
32 * SARRAY *sarrayCreate()
33 * SARRAY *sarrayCreateInitialized()
34 * SARRAY *sarrayCreateWordsFromString()
35 * SARRAY *sarrayCreateLinesFromString()
36 * void sarrayDestroy()
37 * SARRAY *sarrayCopy()
38 * SARRAY *sarrayClone()
39 *
40 * Add/Remove string
41 * l_int32 sarrayAddString()
42 * static l_int32 sarrayExtendArray()
43 * char *sarrayRemoveString()
44 * l_int32 sarrayReplaceString()
45 * l_int32 sarrayClear()
46 *
47 * Accessors
48 * l_int32 sarrayGetCount()
49 * char **sarrayGetArray()
50 * char *sarrayGetString()
51 * l_int32 sarrayGetRefcount()
52 * l_int32 sarrayChangeRefcount()
53 *
54 * Conversion back to string
55 * char *sarrayToString()
56 * char *sarrayToStringRange()
57 *
58 * Join 2 sarrays
59 * l_int32 sarrayJoin()
60 * l_int32 sarrayAppendRange()
61 *
62 * Pad an sarray to be the same size as another sarray
63 * l_int32 sarrayPadToSameSize()
64 *
65 * Convert word sarray to (formatted) line sarray
66 * SARRAY *sarrayConvertWordsToLines()
67 *
68 * Split string on separator list
69 * l_int32 sarraySplitString()
70 *
71 * Filter sarray
72 * SARRAY *sarraySelectBySubstring()
73 * SARRAY *sarraySelectByRange()
74 * l_int32 sarrayParseRange()
75 *
76 * Serialize for I/O
77 * SARRAY *sarrayRead()
78 * SARRAY *sarrayReadStream()
79 * SARRAY *sarrayReadMem()
80 * l_int32 sarrayWrite()
81 * l_int32 sarrayWriteStream()
82 * l_int32 sarrayWriteMem()
83 * l_int32 sarrayAppend()
84 *
85 * Directory filenames
86 * SARRAY *getNumberedPathnamesInDirectory()
87 * SARRAY *getSortedPathnamesInDirectory()
88 * SARRAY *convertSortedToNumberedPathnames()
89 * SARRAY *getFilenamesInDirectory()
90 *
91 * These functions are important for efficient manipulation
92 * of string data, and they have found widespread use in
93 * leptonica. For example:
94 * (1) to generate text files: e.g., PostScript and PDF
95 * wrappers around sets of images
96 * (2) to parse text files: e.g., extracting prototypes
97 * from the source to generate allheaders.h
98 * (3) to generate code for compilation: e.g., the fast
99 * dwa code for arbitrary structuring elements.
100 *
101 * Comments on usage:
102 *
103 * The user is responsible for correctly disposing of strings
104 * that have been extracted from sarrays. In the following,
105 * "str-not-owned" means the returned handle does not own the string,
106 * and "str-owned" means the returned handle owns the string.
107 * - To extract a string from an Sarray in order to inspect it
108 * or to make a copy of it later, get a handle to it:
109 * copyflag = L_NOCOPY.
110 * In this case, you must neither free the string nor put it
111 * directly in another array:
112 * str-not-owned = sarrayGetString(sa, index, L_NOCOPY);
113 * - To extract a copy of a string from an Sarray, use:
114 * str-owned = sarrayGetString(sa, index, L_COPY);
115 * - To insert a string that is in one array (sa1) into another
116 * array (sa2), always leaving the first array intact, there are
117 * two options:
118 * (1) use copyflag = L_COPY to make an immediate copy,
119 * which you then add to the second array by insertion:
120 * str-owned = sarrayGetString(sa1, index, L_COPY);
121 * sarrayAddString(sa2, str-owned, L_INSERT);
122 * (2) use copyflag = L_NOCOPY to get another handle to
123 * the string; you then add a copy of it to the
124 * second string array:
125 * str-not-owned = sarrayGetString(sa1, index, L_NOCOPY);
126 * sarrayAddString(sa2, str-not-owned, L_COPY).
127 * sarrayAddString() transfers ownership to the Sarray, so never
128 * use L_INSERT if the string is owned by another array.
129 *
130 * In all cases, when you use copyflag = L_COPY to extract
131 * a string from an array, you must either free it
132 * or insert it in an array that will be freed later.
133 * </pre>
134 */
135
136 #include <string.h>
137 #ifndef _WIN32
138 #include <dirent.h> /* unix only */
139 #include <sys/stat.h>
140 #include <limits.h> /* needed for realpath() */
141 #include <stdlib.h> /* needed for realpath() */
142 #endif /* ! _WIN32 */
143 #include "allheaders.h"
144
145 static const l_int32 INITIAL_PTR_ARRAYSIZE = 50; /* n'importe quoi */
146 static const l_int32 L_BUF_SIZE = 512;
147
148 /* Static functions */
149 static l_int32 sarrayExtendArray(SARRAY *sa);
150
151
152 /*--------------------------------------------------------------------------*
153 * String array create/destroy/copy/extend *
154 *--------------------------------------------------------------------------*/
155 /*!
156 * \brief sarrayCreate()
157 *
158 * \param[in] n size of string ptr array to be alloc'd;
159 * use 0 for default
160 * \return sarray, or NULL on error
161 */
162 SARRAY *
sarrayCreate(l_int32 n)163 sarrayCreate(l_int32 n)
164 {
165 SARRAY *sa;
166
167 PROCNAME("sarrayCreate");
168
169 if (n <= 0)
170 n = INITIAL_PTR_ARRAYSIZE;
171
172 sa = (SARRAY *)LEPT_CALLOC(1, sizeof(SARRAY));
173 if ((sa->array = (char **)LEPT_CALLOC(n, sizeof(char *))) == NULL) {
174 sarrayDestroy(&sa);
175 return (SARRAY *)ERROR_PTR("ptr array not made", procName, NULL);
176 }
177
178 sa->nalloc = n;
179 sa->n = 0;
180 sa->refcount = 1;
181 return sa;
182 }
183
184
185 /*!
186 * \brief sarrayCreateInitialized()
187 *
188 * \param[in] n size of string ptr array to be alloc'd
189 * \param[in] initstr string to be initialized on the full array
190 * \return sarray, or NULL on error
191 */
192 SARRAY *
sarrayCreateInitialized(l_int32 n,char * initstr)193 sarrayCreateInitialized(l_int32 n,
194 char *initstr)
195 {
196 l_int32 i;
197 SARRAY *sa;
198
199 PROCNAME("sarrayCreateInitialized");
200
201 if (n <= 0)
202 return (SARRAY *)ERROR_PTR("n must be > 0", procName, NULL);
203 if (!initstr)
204 return (SARRAY *)ERROR_PTR("initstr not defined", procName, NULL);
205
206 sa = sarrayCreate(n);
207 for (i = 0; i < n; i++)
208 sarrayAddString(sa, initstr, L_COPY);
209 return sa;
210 }
211
212
213 /*!
214 * \brief sarrayCreateWordsFromString()
215 *
216 * \param[in] string
217 * \return sarray, or NULL on error
218 *
219 * <pre>
220 * Notes:
221 * (1) This finds the number of word substrings, creates an sarray
222 * of this size, and puts copies of each substring into the sarray.
223 * </pre>
224 */
225 SARRAY *
sarrayCreateWordsFromString(const char * string)226 sarrayCreateWordsFromString(const char *string)
227 {
228 char separators[] = " \n\t";
229 l_int32 i, nsub, size, inword;
230 SARRAY *sa;
231
232 PROCNAME("sarrayCreateWordsFromString");
233
234 if (!string)
235 return (SARRAY *)ERROR_PTR("textstr not defined", procName, NULL);
236
237 /* Find the number of words */
238 size = strlen(string);
239 nsub = 0;
240 inword = FALSE;
241 for (i = 0; i < size; i++) {
242 if (inword == FALSE &&
243 (string[i] != ' ' && string[i] != '\t' && string[i] != '\n')) {
244 inword = TRUE;
245 nsub++;
246 } else if (inword == TRUE &&
247 (string[i] == ' ' || string[i] == '\t' || string[i] == '\n')) {
248 inword = FALSE;
249 }
250 }
251
252 if ((sa = sarrayCreate(nsub)) == NULL)
253 return (SARRAY *)ERROR_PTR("sa not made", procName, NULL);
254 sarraySplitString(sa, string, separators);
255
256 return sa;
257 }
258
259
260 /*!
261 * \brief sarrayCreateLinesFromString()
262 *
263 * \param[in] string
264 * \param[in] blankflag 0 to exclude blank lines; 1 to include
265 * \return sarray, or NULL on error
266 *
267 * <pre>
268 * Notes:
269 * (1) This finds the number of line substrings, each of which
270 * ends with a newline, and puts a copy of each substring
271 * in a new sarray.
272 * (2) The newline characters are removed from each substring.
273 * </pre>
274 */
275 SARRAY *
sarrayCreateLinesFromString(const char * string,l_int32 blankflag)276 sarrayCreateLinesFromString(const char *string,
277 l_int32 blankflag)
278 {
279 l_int32 i, nsub, size, startptr;
280 char *cstring, *substring;
281 SARRAY *sa;
282
283 PROCNAME("sarrayCreateLinesFromString");
284
285 if (!string)
286 return (SARRAY *)ERROR_PTR("textstr not defined", procName, NULL);
287
288 /* Find the number of lines */
289 size = strlen(string);
290 nsub = 0;
291 for (i = 0; i < size; i++) {
292 if (string[i] == '\n')
293 nsub++;
294 }
295
296 if ((sa = sarrayCreate(nsub)) == NULL)
297 return (SARRAY *)ERROR_PTR("sa not made", procName, NULL);
298
299 if (blankflag) { /* keep blank lines as null strings */
300 /* Make a copy for munging */
301 if ((cstring = stringNew(string)) == NULL) {
302 sarrayDestroy(&sa);
303 return (SARRAY *)ERROR_PTR("cstring not made", procName, NULL);
304 }
305 /* We'll insert nulls like strtok */
306 startptr = 0;
307 for (i = 0; i < size; i++) {
308 if (cstring[i] == '\n') {
309 cstring[i] = '\0';
310 if (i > 0 && cstring[i - 1] == '\r')
311 cstring[i - 1] = '\0'; /* also remove Windows CR */
312 if ((substring = stringNew(cstring + startptr)) == NULL) {
313 sarrayDestroy(&sa);
314 LEPT_FREE(cstring);
315 return (SARRAY *)ERROR_PTR("substring not made",
316 procName, NULL);
317 }
318 sarrayAddString(sa, substring, L_INSERT);
319 /* fprintf(stderr, "substring = %s\n", substring); */
320 startptr = i + 1;
321 }
322 }
323 if (startptr < size) { /* no newline at end of last line */
324 if ((substring = stringNew(cstring + startptr)) == NULL) {
325 sarrayDestroy(&sa);
326 LEPT_FREE(cstring);
327 return (SARRAY *)ERROR_PTR("substring not made",
328 procName, NULL);
329 }
330 sarrayAddString(sa, substring, L_INSERT);
331 /* fprintf(stderr, "substring = %s\n", substring); */
332 }
333 LEPT_FREE(cstring);
334 } else { /* remove blank lines; use strtok */
335 sarraySplitString(sa, string, "\r\n");
336 }
337
338 return sa;
339 }
340
341
342 /*!
343 * \brief sarrayDestroy()
344 *
345 * \param[in,out] psa to be nulled
346 * \return void
347 *
348 * <pre>
349 * Notes:
350 * (1) Decrements the ref count and, if 0, destroys the sarray.
351 * (2) Always nulls the input ptr.
352 * </pre>
353 */
354 void
sarrayDestroy(SARRAY ** psa)355 sarrayDestroy(SARRAY **psa)
356 {
357 l_int32 i;
358 SARRAY *sa;
359
360 PROCNAME("sarrayDestroy");
361
362 if (psa == NULL) {
363 L_WARNING("ptr address is NULL!\n", procName);
364 return;
365 }
366 if ((sa = *psa) == NULL)
367 return;
368
369 sarrayChangeRefcount(sa, -1);
370 if (sarrayGetRefcount(sa) <= 0) {
371 if (sa->array) {
372 for (i = 0; i < sa->n; i++) {
373 if (sa->array[i])
374 LEPT_FREE(sa->array[i]);
375 }
376 LEPT_FREE(sa->array);
377 }
378 LEPT_FREE(sa);
379 }
380
381 *psa = NULL;
382 return;
383 }
384
385
386 /*!
387 * \brief sarrayCopy()
388 *
389 * \param[in] sa string array
390 * \return copy of sarray, or NULL on error
391 */
392 SARRAY *
sarrayCopy(SARRAY * sa)393 sarrayCopy(SARRAY *sa)
394 {
395 l_int32 i;
396 SARRAY *csa;
397
398 PROCNAME("sarrayCopy");
399
400 if (!sa)
401 return (SARRAY *)ERROR_PTR("sa not defined", procName, NULL);
402
403 if ((csa = sarrayCreate(sa->nalloc)) == NULL)
404 return (SARRAY *)ERROR_PTR("csa not made", procName, NULL);
405
406 for (i = 0; i < sa->n; i++)
407 sarrayAddString(csa, sa->array[i], L_COPY);
408
409 return csa;
410 }
411
412
413 /*!
414 * \brief sarrayClone()
415 *
416 * \param[in] sa string array
417 * \return ptr to same sarray, or NULL on error
418 */
419 SARRAY *
sarrayClone(SARRAY * sa)420 sarrayClone(SARRAY *sa)
421 {
422 PROCNAME("sarrayClone");
423
424 if (!sa)
425 return (SARRAY *)ERROR_PTR("sa not defined", procName, NULL);
426 sarrayChangeRefcount(sa, 1);
427 return sa;
428 }
429
430
431 /*!
432 * \brief sarrayAddString()
433 *
434 * \param[in] sa string array
435 * \param[in] string string to be added
436 * \param[in] copyflag L_INSERT, L_NOCOPY or L_COPY
437 * \return 0 if OK, 1 on error
438 *
439 * <pre>
440 * Notes:
441 * (1) See usage comments at the top of this file. L_INSERT is
442 * equivalent to L_NOCOPY.
443 * </pre>
444 */
445 l_int32
sarrayAddString(SARRAY * sa,char * string,l_int32 copyflag)446 sarrayAddString(SARRAY *sa,
447 char *string,
448 l_int32 copyflag)
449 {
450 l_int32 n;
451
452 PROCNAME("sarrayAddString");
453
454 if (!sa)
455 return ERROR_INT("sa not defined", procName, 1);
456 if (!string)
457 return ERROR_INT("string not defined", procName, 1);
458 if (copyflag != L_INSERT && copyflag != L_NOCOPY && copyflag != L_COPY)
459 return ERROR_INT("invalid copyflag", procName, 1);
460
461 n = sarrayGetCount(sa);
462 if (n >= sa->nalloc)
463 sarrayExtendArray(sa);
464
465 if (copyflag == L_COPY)
466 sa->array[n] = stringNew(string);
467 else /* L_INSERT or L_NOCOPY */
468 sa->array[n] = string;
469 sa->n++;
470
471 return 0;
472 }
473
474
475 /*!
476 * \brief sarrayExtendArray()
477 *
478 * \param[in] sa string array
479 * \return 0 if OK, 1 on error
480 */
481 static l_int32
sarrayExtendArray(SARRAY * sa)482 sarrayExtendArray(SARRAY *sa)
483 {
484 PROCNAME("sarrayExtendArray");
485
486 if (!sa)
487 return ERROR_INT("sa not defined", procName, 1);
488
489 if ((sa->array = (char **)reallocNew((void **)&sa->array,
490 sizeof(char *) * sa->nalloc,
491 2 * sizeof(char *) * sa->nalloc)) == NULL)
492 return ERROR_INT("new ptr array not returned", procName, 1);
493
494 sa->nalloc *= 2;
495 return 0;
496 }
497
498
499 /*!
500 * \brief sarrayRemoveString()
501 *
502 * \param[in] sa string array
503 * \param[in] index of string within sarray
504 * \return removed string, or NULL on error
505 */
506 char *
sarrayRemoveString(SARRAY * sa,l_int32 index)507 sarrayRemoveString(SARRAY *sa,
508 l_int32 index)
509 {
510 char *string;
511 char **array;
512 l_int32 i, n, nalloc;
513
514 PROCNAME("sarrayRemoveString");
515
516 if (!sa)
517 return (char *)ERROR_PTR("sa not defined", procName, NULL);
518
519 if ((array = sarrayGetArray(sa, &nalloc, &n)) == NULL)
520 return (char *)ERROR_PTR("array not returned", procName, NULL);
521
522 if (index < 0 || index >= n)
523 return (char *)ERROR_PTR("array index out of bounds", procName, NULL);
524
525 string = array[index];
526
527 /* If removed string is not at end of array, shift
528 * to fill in, maintaining original ordering.
529 * Note: if we didn't care about the order, we could
530 * put the last string array[n - 1] directly into the hole. */
531 for (i = index; i < n - 1; i++)
532 array[i] = array[i + 1];
533
534 sa->n--;
535 return string;
536 }
537
538
539 /*!
540 * \brief sarrayReplaceString()
541 *
542 * \param[in] sa string array
543 * \param[in] index of string within sarray to be replaced
544 * \param[in] newstr string to replace existing one
545 * \param[in] copyflag L_INSERT, L_COPY
546 * \return 0 if OK, 1 on error
547 *
548 * <pre>
549 * Notes:
550 * (1) This destroys an existing string and replaces it with
551 * the new string or a copy of it.
552 * (2) By design, an sarray is always compacted, so there are
553 * never any holes (null ptrs) in the ptr array up to the
554 * current count.
555 * </pre>
556 */
557 l_int32
sarrayReplaceString(SARRAY * sa,l_int32 index,char * newstr,l_int32 copyflag)558 sarrayReplaceString(SARRAY *sa,
559 l_int32 index,
560 char *newstr,
561 l_int32 copyflag)
562 {
563 char *str;
564 l_int32 n;
565
566 PROCNAME("sarrayReplaceString");
567
568 if (!sa)
569 return ERROR_INT("sa not defined", procName, 1);
570 n = sarrayGetCount(sa);
571 if (index < 0 || index >= n)
572 return ERROR_INT("array index out of bounds", procName, 1);
573 if (!newstr)
574 return ERROR_INT("newstr not defined", procName, 1);
575 if (copyflag != L_INSERT && copyflag != L_COPY)
576 return ERROR_INT("invalid copyflag", procName, 1);
577
578 LEPT_FREE(sa->array[index]);
579 if (copyflag == L_INSERT)
580 str = newstr;
581 else /* L_COPY */
582 str = stringNew(newstr);
583 sa->array[index] = str;
584 return 0;
585 }
586
587
588 /*!
589 * \brief sarrayClear()
590 *
591 * \param[in] sa string array
592 * \return 0 if OK; 1 on error
593 */
594 l_int32
sarrayClear(SARRAY * sa)595 sarrayClear(SARRAY *sa)
596 {
597 l_int32 i;
598
599 PROCNAME("sarrayClear");
600
601 if (!sa)
602 return ERROR_INT("sa not defined", procName, 1);
603 for (i = 0; i < sa->n; i++) { /* free strings and null ptrs */
604 LEPT_FREE(sa->array[i]);
605 sa->array[i] = NULL;
606 }
607 sa->n = 0;
608 return 0;
609 }
610
611
612 /*----------------------------------------------------------------------*
613 * Accessors *
614 *----------------------------------------------------------------------*/
615 /*!
616 * \brief sarrayGetCount()
617 *
618 * \param[in] sa string array
619 * \return count, or 0 if no strings or on error
620 */
621 l_int32
sarrayGetCount(SARRAY * sa)622 sarrayGetCount(SARRAY *sa)
623 {
624 PROCNAME("sarrayGetCount");
625
626 if (!sa)
627 return ERROR_INT("sa not defined", procName, 0);
628 return sa->n;
629 }
630
631
632 /*!
633 * \brief sarrayGetArray()
634 *
635 * \param[in] sa string array
636 * \param[out] pnalloc [optional] number allocated string ptrs
637 * \param[out] pn [optional] number allocated strings
638 * \return ptr to string array, or NULL on error
639 *
640 * <pre>
641 * Notes:
642 * (1) Caution: the returned array is not a copy, so caller
643 * must not destroy it!
644 * </pre>
645 */
646 char **
sarrayGetArray(SARRAY * sa,l_int32 * pnalloc,l_int32 * pn)647 sarrayGetArray(SARRAY *sa,
648 l_int32 *pnalloc,
649 l_int32 *pn)
650 {
651 char **array;
652
653 PROCNAME("sarrayGetArray");
654
655 if (!sa)
656 return (char **)ERROR_PTR("sa not defined", procName, NULL);
657
658 array = sa->array;
659 if (pnalloc) *pnalloc = sa->nalloc;
660 if (pn) *pn = sa->n;
661
662 return array;
663 }
664
665
666 /*!
667 * \brief sarrayGetString()
668 *
669 * \param[in] sa string array
670 * \param[in] index to the index-th string
671 * \param[in] copyflag L_NOCOPY or L_COPY
672 * \return string, or NULL on error
673 *
674 * <pre>
675 * Notes:
676 * (1) See usage comments at the top of this file.
677 * (2) To get a pointer to the string itself, use L_NOCOPY.
678 * To get a copy of the string, use L_COPY.
679 * </pre>
680 */
681 char *
sarrayGetString(SARRAY * sa,l_int32 index,l_int32 copyflag)682 sarrayGetString(SARRAY *sa,
683 l_int32 index,
684 l_int32 copyflag)
685 {
686 PROCNAME("sarrayGetString");
687
688 if (!sa)
689 return (char *)ERROR_PTR("sa not defined", procName, NULL);
690 if (index < 0 || index >= sa->n)
691 return (char *)ERROR_PTR("index not valid", procName, NULL);
692 if (copyflag != L_NOCOPY && copyflag != L_COPY)
693 return (char *)ERROR_PTR("invalid copyflag", procName, NULL);
694
695 if (copyflag == L_NOCOPY)
696 return sa->array[index];
697 else /* L_COPY */
698 return stringNew(sa->array[index]);
699 }
700
701
702 /*!
703 * \brief sarrayGetRefCount()
704 *
705 * \param[in] sa string array
706 * \return refcount, or UNDEF on error
707 */
708 l_int32
sarrayGetRefcount(SARRAY * sa)709 sarrayGetRefcount(SARRAY *sa)
710 {
711 PROCNAME("sarrayGetRefcount");
712
713 if (!sa)
714 return ERROR_INT("sa not defined", procName, UNDEF);
715 return sa->refcount;
716 }
717
718
719 /*!
720 * \brief sarrayChangeRefCount()
721 *
722 * \param[in] sa string array
723 * \param[in] delta change to be applied
724 * \return 0 if OK, 1 on error
725 */
726 l_int32
sarrayChangeRefcount(SARRAY * sa,l_int32 delta)727 sarrayChangeRefcount(SARRAY *sa,
728 l_int32 delta)
729 {
730 PROCNAME("sarrayChangeRefcount");
731
732 if (!sa)
733 return ERROR_INT("sa not defined", procName, UNDEF);
734 sa->refcount += delta;
735 return 0;
736 }
737
738
739 /*----------------------------------------------------------------------*
740 * Conversion to string *
741 *----------------------------------------------------------------------*/
742 /*!
743 * \brief sarrayToString()
744 *
745 * \param[in] sa string array
746 * \param[in] addnlflag flag: 0 adds nothing to each substring
747 * 1 adds '\n' to each substring
748 * 2 adds ' ' to each substring
749 * \return dest string, or NULL on error
750 *
751 * <pre>
752 * Notes:
753 * (1) Concatenates all the strings in the sarray, preserving
754 * all white space.
755 * (2) If addnlflag != 0, adds either a '\n' or a ' ' after
756 * each substring.
757 * (3) This function was NOT implemented as:
758 * for (i = 0; i < n; i++)
759 * strcat(dest, sarrayGetString(sa, i, L_NOCOPY));
760 * Do you see why?
761 * </pre>
762 */
763 char *
sarrayToString(SARRAY * sa,l_int32 addnlflag)764 sarrayToString(SARRAY *sa,
765 l_int32 addnlflag)
766 {
767 PROCNAME("sarrayToString");
768
769 if (!sa)
770 return (char *)ERROR_PTR("sa not defined", procName, NULL);
771
772 return sarrayToStringRange(sa, 0, 0, addnlflag);
773 }
774
775
776 /*!
777 * \brief sarrayToStringRange()
778 *
779 * \param[in] sa string array
780 * \param[in] first index of first string to use; starts with 0
781 * \param[in] nstrings number of strings to append into the result; use
782 * 0 to append to the end of the sarray
783 * \param[in] addnlflag flag: 0 adds nothing to each substring
784 * 1 adds '\n' to each substring
785 * 2 adds ' ' to each substring
786 * \return dest string, or NULL on error
787 *
788 * <pre>
789 * Notes:
790 * (1) Concatenates the specified strings inthe sarray, preserving
791 * all white space.
792 * (2) If addnlflag != 0, adds either a '\n' or a ' ' after
793 * each substring.
794 * (3) If the sarray is empty, this returns a string with just
795 * the character corresponding to %addnlflag.
796 * </pre>
797 */
798 char *
sarrayToStringRange(SARRAY * sa,l_int32 first,l_int32 nstrings,l_int32 addnlflag)799 sarrayToStringRange(SARRAY *sa,
800 l_int32 first,
801 l_int32 nstrings,
802 l_int32 addnlflag)
803 {
804 char *dest, *src, *str;
805 l_int32 n, i, last, size, index, len;
806
807 PROCNAME("sarrayToStringRange");
808
809 if (!sa)
810 return (char *)ERROR_PTR("sa not defined", procName, NULL);
811 if (addnlflag != 0 && addnlflag != 1 && addnlflag != 2)
812 return (char *)ERROR_PTR("invalid addnlflag", procName, NULL);
813
814 n = sarrayGetCount(sa);
815
816 /* Empty sa; return char corresponding to addnlflag only */
817 if (n == 0) {
818 if (first == 0) {
819 if (addnlflag == 0)
820 return stringNew("");
821 if (addnlflag == 1)
822 return stringNew("\n");
823 else /* addnlflag == 2) */
824 return stringNew(" ");
825 } else {
826 return (char *)ERROR_PTR("first not valid", procName, NULL);
827 }
828 }
829
830 if (first < 0 || first >= n)
831 return (char *)ERROR_PTR("first not valid", procName, NULL);
832 if (nstrings == 0 || (nstrings > n - first))
833 nstrings = n - first; /* no overflow */
834 last = first + nstrings - 1;
835
836 size = 0;
837 for (i = first; i <= last; i++) {
838 if ((str = sarrayGetString(sa, i, L_NOCOPY)) == NULL)
839 return (char *)ERROR_PTR("str not found", procName, NULL);
840 size += strlen(str) + 2;
841 }
842
843 if ((dest = (char *)LEPT_CALLOC(size + 1, sizeof(char))) == NULL)
844 return (char *)ERROR_PTR("dest not made", procName, NULL);
845
846 index = 0;
847 for (i = first; i <= last; i++) {
848 src = sarrayGetString(sa, i, L_NOCOPY);
849 len = strlen(src);
850 memcpy(dest + index, src, len);
851 index += len;
852 if (addnlflag == 1) {
853 dest[index] = '\n';
854 index++;
855 } else if (addnlflag == 2) {
856 dest[index] = ' ';
857 index++;
858 }
859 }
860
861 return dest;
862 }
863
864
865 /*----------------------------------------------------------------------*
866 * Join 2 sarrays *
867 *----------------------------------------------------------------------*/
868 /*!
869 * \brief sarrayJoin()
870 *
871 * \param[in] sa1 to be added to
872 * \param[in] sa2 append to sa1
873 * \return 0 if OK, 1 on error
874 *
875 * <pre>
876 * Notes:
877 * (1) Copies of the strings in sarray2 are added to sarray1.
878 * </pre>
879 */
880 l_int32
sarrayJoin(SARRAY * sa1,SARRAY * sa2)881 sarrayJoin(SARRAY *sa1,
882 SARRAY *sa2)
883 {
884 char *str;
885 l_int32 n, i;
886
887 PROCNAME("sarrayJoin");
888
889 if (!sa1)
890 return ERROR_INT("sa1 not defined", procName, 1);
891 if (!sa2)
892 return ERROR_INT("sa2 not defined", procName, 1);
893
894 n = sarrayGetCount(sa2);
895 for (i = 0; i < n; i++) {
896 str = sarrayGetString(sa2, i, L_NOCOPY);
897 sarrayAddString(sa1, str, L_COPY);
898 }
899
900 return 0;
901 }
902
903
904 /*!
905 * \brief sarrayAppendRange()
906 *
907 * \param[in] sa1 to be added to
908 * \param[in] sa2 append specified range of strings in sa2 to sa1
909 * \param[in] start index of first string of sa2 to append
910 * \param[in] end index of last string of sa2 to append; -1 to end of array
911 * \return 0 if OK, 1 on error
912 *
913 * <pre>
914 * Notes:
915 * (1) Copies of the strings in sarray2 are added to sarray1.
916 * (2) The [start ... end] range is truncated if necessary.
917 * (3) Use end == -1 to append to the end of sa2.
918 * </pre>
919 */
920 l_int32
sarrayAppendRange(SARRAY * sa1,SARRAY * sa2,l_int32 start,l_int32 end)921 sarrayAppendRange(SARRAY *sa1,
922 SARRAY *sa2,
923 l_int32 start,
924 l_int32 end)
925 {
926 char *str;
927 l_int32 n, i;
928
929 PROCNAME("sarrayAppendRange");
930
931 if (!sa1)
932 return ERROR_INT("sa1 not defined", procName, 1);
933 if (!sa2)
934 return ERROR_INT("sa2 not defined", procName, 1);
935
936 if (start < 0)
937 start = 0;
938 n = sarrayGetCount(sa2);
939 if (end < 0 || end >= n)
940 end = n - 1;
941 if (start > end)
942 return ERROR_INT("start > end", procName, 1);
943
944 for (i = start; i <= end; i++) {
945 str = sarrayGetString(sa2, i, L_NOCOPY);
946 sarrayAddString(sa1, str, L_COPY);
947 }
948
949 return 0;
950 }
951
952
953 /*----------------------------------------------------------------------*
954 * Pad an sarray to be the same size as another sarray *
955 *----------------------------------------------------------------------*/
956 /*!
957 * \brief sarrayPadToSameSize()
958 *
959 * \param[in] sa1, sa2
960 * \param[in] padstring
961 * \return 0 if OK, 1 on error
962 *
963 * <pre>
964 * Notes:
965 * (1) If two sarrays have different size, this adds enough
966 * instances of %padstring to the smaller so that they are
967 * the same size. It is useful when two or more sarrays
968 * are being sequenced in parallel, and it is necessary to
969 * find a valid string at each index.
970 * </pre>
971 */
972 l_int32
sarrayPadToSameSize(SARRAY * sa1,SARRAY * sa2,char * padstring)973 sarrayPadToSameSize(SARRAY *sa1,
974 SARRAY *sa2,
975 char *padstring)
976 {
977 l_int32 i, n1, n2;
978
979 PROCNAME("sarrayPadToSameSize");
980
981 if (!sa1 || !sa2)
982 return ERROR_INT("both sa1 and sa2 not defined", procName, 1);
983
984 n1 = sarrayGetCount(sa1);
985 n2 = sarrayGetCount(sa2);
986 if (n1 < n2) {
987 for (i = n1; i < n2; i++)
988 sarrayAddString(sa1, padstring, L_COPY);
989 } else if (n1 > n2) {
990 for (i = n2; i < n1; i++)
991 sarrayAddString(sa2, padstring, L_COPY);
992 }
993
994 return 0;
995 }
996
997
998 /*----------------------------------------------------------------------*
999 * Convert word sarray to line sarray *
1000 *----------------------------------------------------------------------*/
1001 /*!
1002 * \brief sarrayConvertWordsToLines()
1003 *
1004 * \param[in] sa sa of individual words
1005 * \param[in] linesize max num of chars in each line
1006 * \return saout sa of formatted lines, or NULL on error
1007 *
1008 * This is useful for re-typesetting text to a specific maximum
1009 * line length. The individual words in the input sarray
1010 * are concatenated into textlines. An input word string of zero
1011 * length is taken to be a paragraph separator. Each time
1012 * such a string is found, the current line is ended and
1013 * a new line is also produced that contains just the
1014 * string of zero length "". When the output sarray
1015 * of lines is eventually converted to a string with newlines
1016 * typically appended to each line string, the empty
1017 * strings are just converted to newlines, producing the visible
1018 * paragraph separation.
1019 *
1020 * What happens when a word is larger than linesize?
1021 * We write it out as a single line anyway! Words preceding
1022 * or following this long word are placed on lines preceding
1023 * or following the line with the long word. Why this choice?
1024 * Long "words" found in text documents are typically URLs, and
1025 * it's often desirable not to put newlines in the middle of a URL.
1026 * The text display program e.g., text editor will typically
1027 * wrap the long "word" to fit in the window.
1028 */
1029 SARRAY *
sarrayConvertWordsToLines(SARRAY * sa,l_int32 linesize)1030 sarrayConvertWordsToLines(SARRAY *sa,
1031 l_int32 linesize)
1032 {
1033 char *wd, *strl;
1034 char emptystring[] = "";
1035 l_int32 n, i, len, totlen;
1036 SARRAY *sal, *saout;
1037
1038 PROCNAME("sarrayConvertWordsToLines");
1039
1040 if (!sa)
1041 return (SARRAY *)ERROR_PTR("sa not defined", procName, NULL);
1042
1043 saout = sarrayCreate(0);
1044 n = sarrayGetCount(sa);
1045 totlen = 0;
1046 sal = NULL;
1047 for (i = 0; i < n; i++) {
1048 if (!sal)
1049 sal = sarrayCreate(0);
1050 wd = sarrayGetString(sa, i, L_NOCOPY);
1051 len = strlen(wd);
1052 if (len == 0) { /* end of paragraph: end line & insert blank line */
1053 if (totlen > 0) {
1054 strl = sarrayToString(sal, 2);
1055 sarrayAddString(saout, strl, L_INSERT);
1056 }
1057 sarrayAddString(saout, emptystring, L_COPY);
1058 sarrayDestroy(&sal);
1059 totlen = 0;
1060 } else if (totlen == 0 && len + 1 > linesize) { /* long word! */
1061 sarrayAddString(saout, wd, L_COPY); /* copy to one line */
1062 } else if (totlen + len + 1 > linesize) { /* end line & start new */
1063 strl = sarrayToString(sal, 2);
1064 sarrayAddString(saout, strl, L_INSERT);
1065 sarrayDestroy(&sal);
1066 sal = sarrayCreate(0);
1067 sarrayAddString(sal, wd, L_COPY);
1068 totlen = len + 1;
1069 } else { /* add to current line */
1070 sarrayAddString(sal, wd, L_COPY);
1071 totlen += len + 1;
1072 }
1073 }
1074 if (totlen > 0) { /* didn't end with blank line; output last line */
1075 strl = sarrayToString(sal, 2);
1076 sarrayAddString(saout, strl, L_INSERT);
1077 sarrayDestroy(&sal);
1078 }
1079
1080 return saout;
1081 }
1082
1083
1084 /*----------------------------------------------------------------------*
1085 * Split string on separator list *
1086 *----------------------------------------------------------------------*/
1087 /*
1088 * sarraySplitString()
1089 *
1090 * Input: sa (to append to; typically empty initially)
1091 * str (string to split; not changed)
1092 * separators (characters that split input string)
1093 * Return: 0 if OK, 1 on error.
1094 *
1095 * Notes:
1096 * (1) This uses strtokSafe(). See the notes there in utils.c.
1097 */
1098 l_int32
sarraySplitString(SARRAY * sa,const char * str,const char * separators)1099 sarraySplitString(SARRAY *sa,
1100 const char *str,
1101 const char *separators)
1102 {
1103 char *cstr, *substr, *saveptr;
1104
1105 PROCNAME("sarraySplitString");
1106
1107 if (!sa)
1108 return ERROR_INT("sa not defined", procName, 1);
1109 if (!str)
1110 return ERROR_INT("str not defined", procName, 1);
1111 if (!separators)
1112 return ERROR_INT("separators not defined", procName, 1);
1113
1114 cstr = stringNew(str); /* preserves const-ness of input str */
1115 substr = strtokSafe(cstr, separators, &saveptr);
1116 if (substr)
1117 sarrayAddString(sa, substr, L_INSERT);
1118 while ((substr = strtokSafe(NULL, separators, &saveptr)))
1119 sarrayAddString(sa, substr, L_INSERT);
1120 LEPT_FREE(cstr);
1121
1122 return 0;
1123 }
1124
1125
1126 /*----------------------------------------------------------------------*
1127 * Filter sarray *
1128 *----------------------------------------------------------------------*/
1129 /*!
1130 * \brief sarraySelectBySubstring()
1131 *
1132 * \param[in] sain input sarray
1133 * \param[in] substr [optional] substring for matching; can be NULL
1134 * \return saout output sarray, filtered with substring or NULL on error
1135 *
1136 * <pre>
1137 * Notes:
1138 * (1) This selects all strings in sain that have substr as a substring.
1139 * Note that we can't use strncmp() because we're looking for
1140 * a match to the substring anywhere within each filename.
1141 * (2) If substr == NULL, returns a copy of the sarray.
1142 * </pre>
1143 */
1144 SARRAY *
sarraySelectBySubstring(SARRAY * sain,const char * substr)1145 sarraySelectBySubstring(SARRAY *sain,
1146 const char *substr)
1147 {
1148 char *str;
1149 l_int32 n, i, offset, found;
1150 SARRAY *saout;
1151
1152 PROCNAME("sarraySelectBySubstring");
1153
1154 if (!sain)
1155 return (SARRAY *)ERROR_PTR("sain not defined", procName, NULL);
1156
1157 n = sarrayGetCount(sain);
1158 if (!substr || n == 0)
1159 return sarrayCopy(sain);
1160
1161 saout = sarrayCreate(n);
1162 for (i = 0; i < n; i++) {
1163 str = sarrayGetString(sain, i, L_NOCOPY);
1164 arrayFindSequence((l_uint8 *)str, strlen(str), (l_uint8 *)substr,
1165 strlen(substr), &offset, &found);
1166 if (found)
1167 sarrayAddString(saout, str, L_COPY);
1168 }
1169
1170 return saout;
1171 }
1172
1173
1174 /*!
1175 * \brief sarraySelectByRange()
1176 *
1177 * \param[in] sain input sarray
1178 * \param[in] first index of first string to be selected
1179 * \param[in] last index of last string to be selected; use 0 to go to the
1180 * end of the sarray
1181 * \return saout output sarray, or NULL on error
1182 *
1183 * <pre>
1184 * Notes:
1185 * (1) This makes %saout consisting of copies of all strings in %sain
1186 * in the index set [first ... last]. Use %last == 0 to get all
1187 * strings from %first to the last string in the sarray.
1188 * </pre>
1189 */
1190 SARRAY *
sarraySelectByRange(SARRAY * sain,l_int32 first,l_int32 last)1191 sarraySelectByRange(SARRAY *sain,
1192 l_int32 first,
1193 l_int32 last)
1194 {
1195 char *str;
1196 l_int32 n, i;
1197 SARRAY *saout;
1198
1199 PROCNAME("sarraySelectByRange");
1200
1201 if (!sain)
1202 return (SARRAY *)ERROR_PTR("sain not defined", procName, NULL);
1203 if (first < 0) first = 0;
1204 n = sarrayGetCount(sain);
1205 if (last <= 0) last = n - 1;
1206 if (last >= n) {
1207 L_WARNING("last > n - 1; setting to n - 1\n", procName);
1208 last = n - 1;
1209 }
1210 if (first > last)
1211 return (SARRAY *)ERROR_PTR("first must be >= last", procName, NULL);
1212
1213 saout = sarrayCreate(0);
1214 for (i = first; i <= last; i++) {
1215 str = sarrayGetString(sain, i, L_COPY);
1216 sarrayAddString(saout, str, L_INSERT);
1217 }
1218
1219 return saout;
1220 }
1221
1222
1223 /*!
1224 * \brief sarrayParseRange()
1225 *
1226 * \param[in] sa input sarray
1227 * \param[in] start index to start range search
1228 * \param[out] pactualstart index of actual start; may be > 'start'
1229 * \param[out] pend index of end
1230 * \param[out] pnewstart index of start of next range
1231 * \param[in] substr substring for matching at beginning of string
1232 * \param[in] loc byte offset within the string for the pattern; use
1233 * -1 if the location does not matter;
1234 * \return 0 if valid range found; 1 otherwise
1235 *
1236 * <pre>
1237 * Notes:
1238 * (1) This finds the range of the next set of strings in SA,
1239 * beginning the search at 'start', that does NOT have
1240 * the substring 'substr' either at the indicated location
1241 * in the string or anywhere in the string. The input
1242 * variable 'loc' is the specified offset within the string;
1243 * use -1 to indicate 'anywhere in the string'.
1244 * (2) Always check the return value to verify that a valid range
1245 * was found.
1246 * (3) If a valid range is not found, the values of actstart,
1247 * end and newstart are all set to the size of sa.
1248 * (4) If this is the last valid range, newstart returns the value n.
1249 * In use, this should be tested before calling the function.
1250 * (5) Usage example. To find all the valid ranges in a file
1251 * where the invalid lines begin with two dashes, copy each
1252 * line in the file to a string in an sarray, and do:
1253 * start = 0;
1254 * while (!sarrayParseRange(sa, start, &actstart, &end, &start,
1255 * "--", 0))
1256 * fprintf(stderr, "start = %d, end = %d\n", actstart, end);
1257 * </pre>
1258 */
1259 l_int32
sarrayParseRange(SARRAY * sa,l_int32 start,l_int32 * pactualstart,l_int32 * pend,l_int32 * pnewstart,const char * substr,l_int32 loc)1260 sarrayParseRange(SARRAY *sa,
1261 l_int32 start,
1262 l_int32 *pactualstart,
1263 l_int32 *pend,
1264 l_int32 *pnewstart,
1265 const char *substr,
1266 l_int32 loc)
1267 {
1268 char *str;
1269 l_int32 n, i, offset, found;
1270
1271 PROCNAME("sarrayParseRange");
1272
1273 if (!sa)
1274 return ERROR_INT("sa not defined", procName, 1);
1275 if (!pactualstart || !pend || !pnewstart)
1276 return ERROR_INT("not all range addresses defined", procName, 1);
1277 n = sarrayGetCount(sa);
1278 *pactualstart = *pend = *pnewstart = n;
1279 if (!substr)
1280 return ERROR_INT("substr not defined", procName, 1);
1281
1282 /* Look for the first string without the marker */
1283 if (start < 0 || start >= n)
1284 return 1;
1285 for (i = start; i < n; i++) {
1286 str = sarrayGetString(sa, i, L_NOCOPY);
1287 arrayFindSequence((l_uint8 *)str, strlen(str), (l_uint8 *)substr,
1288 strlen(substr), &offset, &found);
1289 if (loc < 0) {
1290 if (!found) break;
1291 } else {
1292 if (!found || offset != loc) break;
1293 }
1294 }
1295 start = i;
1296 if (i == n) /* couldn't get started */
1297 return 1;
1298
1299 /* Look for the last string without the marker */
1300 *pactualstart = start;
1301 for (i = start + 1; i < n; i++) {
1302 str = sarrayGetString(sa, i, L_NOCOPY);
1303 arrayFindSequence((l_uint8 *)str, strlen(str), (l_uint8 *)substr,
1304 strlen(substr), &offset, &found);
1305 if (loc < 0) {
1306 if (found) break;
1307 } else {
1308 if (found && offset == loc) break;
1309 }
1310 }
1311 *pend = i - 1;
1312 start = i;
1313 if (i == n) /* no further range */
1314 return 0;
1315
1316 /* Look for the first string after *pend without the marker.
1317 * This will start the next run of strings, if it exists. */
1318 for (i = start; i < n; i++) {
1319 str = sarrayGetString(sa, i, L_NOCOPY);
1320 arrayFindSequence((l_uint8 *)str, strlen(str), (l_uint8 *)substr,
1321 strlen(substr), &offset, &found);
1322 if (loc < 0) {
1323 if (!found) break;
1324 } else {
1325 if (!found || offset != loc) break;
1326 }
1327 }
1328 if (i < n)
1329 *pnewstart = i;
1330
1331 return 0;
1332 }
1333
1334
1335 /*----------------------------------------------------------------------*
1336 * Serialize for I/O *
1337 *----------------------------------------------------------------------*/
1338 /*!
1339 * \brief sarrayRead()
1340 *
1341 * \param[in] filename
1342 * \return sarray, or NULL on error
1343 */
1344 SARRAY *
sarrayRead(const char * filename)1345 sarrayRead(const char *filename)
1346 {
1347 FILE *fp;
1348 SARRAY *sa;
1349
1350 PROCNAME("sarrayRead");
1351
1352 if (!filename)
1353 return (SARRAY *)ERROR_PTR("filename not defined", procName, NULL);
1354
1355 if ((fp = fopenReadStream(filename)) == NULL)
1356 return (SARRAY *)ERROR_PTR("stream not opened", procName, NULL);
1357 sa = sarrayReadStream(fp);
1358 fclose(fp);
1359 if (!sa)
1360 return (SARRAY *)ERROR_PTR("sa not read", procName, NULL);
1361 return sa;
1362 }
1363
1364
1365 /*!
1366 * \brief sarrayReadStream()
1367 *
1368 * \param[in] fp file stream
1369 * \return sarray, or NULL on error
1370 *
1371 * <pre>
1372 * Notes:
1373 * (1) We store the size of each string along with the string.
1374 * The limit on the number of strings is 2^24.
1375 * The limit on the size of any string is 2^30 bytes.
1376 * (2) This allows a string to have embedded newlines. By reading
1377 * the entire string, as determined by its size, we are
1378 * not affected by any number of embedded newlines.
1379 * </pre>
1380 */
1381 SARRAY *
sarrayReadStream(FILE * fp)1382 sarrayReadStream(FILE *fp)
1383 {
1384 char *stringbuf;
1385 l_int32 i, n, size, index, bufsize, version, ignore, success;
1386 SARRAY *sa;
1387
1388 PROCNAME("sarrayReadStream");
1389
1390 if (!fp)
1391 return (SARRAY *)ERROR_PTR("stream not defined", procName, NULL);
1392
1393 if (fscanf(fp, "\nSarray Version %d\n", &version) != 1)
1394 return (SARRAY *)ERROR_PTR("not an sarray file", procName, NULL);
1395 if (version != SARRAY_VERSION_NUMBER)
1396 return (SARRAY *)ERROR_PTR("invalid sarray version", procName, NULL);
1397 if (fscanf(fp, "Number of strings = %d\n", &n) != 1)
1398 return (SARRAY *)ERROR_PTR("error on # strings", procName, NULL);
1399 if (n > (1 << 24))
1400 return (SARRAY *)ERROR_PTR("more than 2^24 strings!", procName, NULL);
1401
1402 success = TRUE;
1403 if ((sa = sarrayCreate(n)) == NULL)
1404 return (SARRAY *)ERROR_PTR("sa not made", procName, NULL);
1405 bufsize = L_BUF_SIZE + 1;
1406 stringbuf = (char *)LEPT_CALLOC(bufsize, sizeof(char));
1407
1408 for (i = 0; i < n; i++) {
1409 /* Get the size of the stored string */
1410 if ((fscanf(fp, "%d[%d]:", &index, &size) != 2) || (size > (1 << 30))) {
1411 success = FALSE;
1412 L_ERROR("error on string size\n", procName);
1413 goto cleanup;
1414 }
1415 /* Expand the string buffer if necessary */
1416 if (size > bufsize - 5) {
1417 LEPT_FREE(stringbuf);
1418 bufsize = (l_int32)(1.5 * size);
1419 stringbuf = (char *)LEPT_CALLOC(bufsize, sizeof(char));
1420 }
1421 /* Read the stored string, plus leading spaces and trailing \n */
1422 if (fread(stringbuf, 1, size + 3, fp) != size + 3) {
1423 success = FALSE;
1424 L_ERROR("error reading string\n", procName);
1425 goto cleanup;
1426 }
1427 /* Remove the \n that was added by sarrayWriteStream() */
1428 stringbuf[size + 2] = '\0';
1429 /* Copy it in, skipping the 2 leading spaces */
1430 sarrayAddString(sa, stringbuf + 2, L_COPY);
1431 }
1432 ignore = fscanf(fp, "\n");
1433
1434 cleanup:
1435 LEPT_FREE(stringbuf);
1436 if (!success) sarrayDestroy(&sa);
1437 return sa;
1438 }
1439
1440
1441 /*!
1442 * \brief sarrayReadMem()
1443 *
1444 * \param[in] data serialization in ascii
1445 * \param[in] size of data; can use strlen to get it
1446 * \return sarray, or NULL on error
1447 */
1448 SARRAY *
sarrayReadMem(const l_uint8 * data,size_t size)1449 sarrayReadMem(const l_uint8 *data,
1450 size_t size)
1451 {
1452 FILE *fp;
1453 SARRAY *sa;
1454
1455 PROCNAME("sarrayReadMem");
1456
1457 if (!data)
1458 return (SARRAY *)ERROR_PTR("data not defined", procName, NULL);
1459 if ((fp = fopenReadFromMemory(data, size)) == NULL)
1460 return (SARRAY *)ERROR_PTR("stream not opened", procName, NULL);
1461
1462 sa = sarrayReadStream(fp);
1463 fclose(fp);
1464 if (!sa) L_ERROR("sarray not read\n", procName);
1465 return sa;
1466 }
1467
1468
1469 /*!
1470 * \brief sarrayWrite()
1471 *
1472 * \param[in] filename
1473 * \param[in] sa string array
1474 * \return 0 if OK; 1 on error
1475 */
1476 l_int32
sarrayWrite(const char * filename,SARRAY * sa)1477 sarrayWrite(const char *filename,
1478 SARRAY *sa)
1479 {
1480 l_int32 ret;
1481 FILE *fp;
1482
1483 PROCNAME("sarrayWrite");
1484
1485 if (!filename)
1486 return ERROR_INT("filename not defined", procName, 1);
1487 if (!sa)
1488 return ERROR_INT("sa not defined", procName, 1);
1489
1490 if ((fp = fopenWriteStream(filename, "w")) == NULL)
1491 return ERROR_INT("stream not opened", procName, 1);
1492 ret = sarrayWriteStream(fp, sa);
1493 fclose(fp);
1494 if (ret)
1495 return ERROR_INT("sa not written to stream", procName, 1);
1496 return 0;
1497 }
1498
1499
1500 /*!
1501 * \brief sarrayWriteStream()
1502 *
1503 * \param[in] fp file stream
1504 * \param[in] sa string array
1505 * \return 0 if OK; 1 on error
1506 *
1507 * <pre>
1508 * Notes:
1509 * (1) This appends a '\n' to each string, which is stripped
1510 * off by sarrayReadStream().
1511 * </pre>
1512 */
1513 l_int32
sarrayWriteStream(FILE * fp,SARRAY * sa)1514 sarrayWriteStream(FILE *fp,
1515 SARRAY *sa)
1516 {
1517 l_int32 i, n, len;
1518
1519 PROCNAME("sarrayWriteStream");
1520
1521 if (!fp)
1522 return ERROR_INT("stream not defined", procName, 1);
1523 if (!sa)
1524 return ERROR_INT("sa not defined", procName, 1);
1525
1526 n = sarrayGetCount(sa);
1527 fprintf(fp, "\nSarray Version %d\n", SARRAY_VERSION_NUMBER);
1528 fprintf(fp, "Number of strings = %d\n", n);
1529 for (i = 0; i < n; i++) {
1530 len = strlen(sa->array[i]);
1531 fprintf(fp, " %d[%d]: %s\n", i, len, sa->array[i]);
1532 }
1533 fprintf(fp, "\n");
1534
1535 return 0;
1536 }
1537
1538
1539 /*!
1540 * \brief sarrayWriteMem()
1541 *
1542 * \param[out] pdata data of serialized sarray; ascii
1543 * \param[out] psize size of returned data
1544 * \param[in] sa
1545 * \return 0 if OK, 1 on error
1546 *
1547 * <pre>
1548 * Notes:
1549 * (1) Serializes a sarray in memory and puts the result in a buffer.
1550 * </pre>
1551 */
1552 l_int32
sarrayWriteMem(l_uint8 ** pdata,size_t * psize,SARRAY * sa)1553 sarrayWriteMem(l_uint8 **pdata,
1554 size_t *psize,
1555 SARRAY *sa)
1556 {
1557 l_int32 ret;
1558 FILE *fp;
1559
1560 PROCNAME("sarrayWriteMem");
1561
1562 if (pdata) *pdata = NULL;
1563 if (psize) *psize = 0;
1564 if (!pdata)
1565 return ERROR_INT("&data not defined", procName, 1);
1566 if (!psize)
1567 return ERROR_INT("&size not defined", procName, 1);
1568 if (!sa)
1569 return ERROR_INT("sa not defined", procName, 1);
1570
1571 #if HAVE_FMEMOPEN
1572 if ((fp = open_memstream((char **)pdata, psize)) == NULL)
1573 return ERROR_INT("stream not opened", procName, 1);
1574 ret = sarrayWriteStream(fp, sa);
1575 #else
1576 L_INFO("work-around: writing to a temp file\n", procName);
1577 #ifdef _WIN32
1578 if ((fp = fopenWriteWinTempfile()) == NULL)
1579 return ERROR_INT("tmpfile stream not opened", procName, 1);
1580 #else
1581 if ((fp = tmpfile()) == NULL)
1582 return ERROR_INT("tmpfile stream not opened", procName, 1);
1583 #endif /* _WIN32 */
1584 ret = sarrayWriteStream(fp, sa);
1585 rewind(fp);
1586 *pdata = l_binaryReadStream(fp, psize);
1587 #endif /* HAVE_FMEMOPEN */
1588 fclose(fp);
1589 return ret;
1590 }
1591
1592
1593 /*!
1594 * \brief sarrayAppend()
1595 *
1596 * \param[in] filename
1597 * \param[in] sa
1598 * \return 0 if OK; 1 on error
1599 */
1600 l_int32
sarrayAppend(const char * filename,SARRAY * sa)1601 sarrayAppend(const char *filename,
1602 SARRAY *sa)
1603 {
1604 FILE *fp;
1605
1606 PROCNAME("sarrayAppend");
1607
1608 if (!filename)
1609 return ERROR_INT("filename not defined", procName, 1);
1610 if (!sa)
1611 return ERROR_INT("sa not defined", procName, 1);
1612
1613 if ((fp = fopenWriteStream(filename, "a")) == NULL)
1614 return ERROR_INT("stream not opened", procName, 1);
1615 if (sarrayWriteStream(fp, sa)) {
1616 fclose(fp);
1617 return ERROR_INT("sa not appended to stream", procName, 1);
1618 }
1619
1620 fclose(fp);
1621 return 0;
1622 }
1623
1624
1625 /*---------------------------------------------------------------------*
1626 * Directory filenames *
1627 *---------------------------------------------------------------------*/
1628 /*!
1629 * \brief getNumberedPathnamesInDirectory()
1630 *
1631 * \param[in] dirname directory name
1632 * \param[in] substr [optional] substring filter on filenames; can be NULL
1633 * \param[in] numpre number of characters in name before number
1634 * \param[in] numpost number of characters in name after the number,
1635 * up to a dot before an extension
1636 * \param[in] maxnum only consider page numbers up to this value
1637 * \return sarray of numbered pathnames, or NULL on error
1638 *
1639 * <pre>
1640 * Notes:
1641 * (1) Returns the full pathnames of the numbered filenames in
1642 * the directory. The number in the filename is the index
1643 * into the sarray. For indices for which there are no filenames,
1644 * an empty string ("") is placed into the sarray.
1645 * This makes reading numbered files very simple. For example,
1646 * the image whose filename includes number N can be retrieved using
1647 * pixReadIndexed(sa, N);
1648 * (2) If %substr is not NULL, only filenames that contain
1649 * the substring can be included. If %substr is NULL,
1650 * all matching filenames are used.
1651 * (3) If no numbered files are found, it returns an empty sarray,
1652 * with no initialized strings.
1653 * (4) It is assumed that the page number is contained within
1654 * the basename (the filename without directory or extension).
1655 * %numpre is the number of characters in the basename
1656 * preceding the actual page number; %numpost is the number
1657 * following the page number, up to either the end of the
1658 * basename or a ".", whichever comes first.
1659 * (5) This is useful when all filenames contain numbers that are
1660 * not necessarily consecutive. 0-padding is not required.
1661 * (6) To use a O(n) matching algorithm, the largest page number
1662 * is found and two internal arrays of this size are created.
1663 * This maximum is constrained not to exceed %maxsum,
1664 * to make sure that an unrealistically large number is not
1665 * accidentally used to determine the array sizes.
1666 * </pre>
1667 */
1668 SARRAY *
getNumberedPathnamesInDirectory(const char * dirname,const char * substr,l_int32 numpre,l_int32 numpost,l_int32 maxnum)1669 getNumberedPathnamesInDirectory(const char *dirname,
1670 const char *substr,
1671 l_int32 numpre,
1672 l_int32 numpost,
1673 l_int32 maxnum)
1674 {
1675 l_int32 nfiles;
1676 SARRAY *sa, *saout;
1677
1678 PROCNAME("getNumberedPathnamesInDirectory");
1679
1680 if (!dirname)
1681 return (SARRAY *)ERROR_PTR("dirname not defined", procName, NULL);
1682
1683 if ((sa = getSortedPathnamesInDirectory(dirname, substr, 0, 0)) == NULL)
1684 return (SARRAY *)ERROR_PTR("sa not made", procName, NULL);
1685 if ((nfiles = sarrayGetCount(sa)) == 0) {
1686 sarrayDestroy(&sa);
1687 return sarrayCreate(1);
1688 }
1689
1690 saout = convertSortedToNumberedPathnames(sa, numpre, numpost, maxnum);
1691 sarrayDestroy(&sa);
1692 return saout;
1693 }
1694
1695
1696 /*!
1697 * \brief getSortedPathnamesInDirectory()
1698 *
1699 * \param[in] dirname directory name
1700 * \param[in] substr [optional] substring filter on filenames; can be NULL
1701 * \param[in] first 0-based
1702 * \param[in] nfiles use 0 for all to the end
1703 * \return sarray of sorted pathnames, or NULL on error
1704 *
1705 * <pre>
1706 * Notes:
1707 * (1) Use %substr to filter filenames in the directory. If
1708 * %substr == NULL, this takes all files.
1709 * (2) The files in the directory, after optional filtering by
1710 * the substring, are lexically sorted in increasing order.
1711 * Use %first and %nfiles to select a contiguous set of files.
1712 * (3) The full pathnames are returned for the requested sequence.
1713 * If no files are found after filtering, returns an empty sarray.
1714 * </pre>
1715 */
1716 SARRAY *
getSortedPathnamesInDirectory(const char * dirname,const char * substr,l_int32 first,l_int32 nfiles)1717 getSortedPathnamesInDirectory(const char *dirname,
1718 const char *substr,
1719 l_int32 first,
1720 l_int32 nfiles)
1721 {
1722 char *fname, *fullname;
1723 l_int32 i, n, last;
1724 SARRAY *sa, *safiles, *saout;
1725
1726 PROCNAME("getSortedPathnamesInDirectory");
1727
1728 if (!dirname)
1729 return (SARRAY *)ERROR_PTR("dirname not defined", procName, NULL);
1730
1731 if ((sa = getFilenamesInDirectory(dirname)) == NULL)
1732 return (SARRAY *)ERROR_PTR("sa not made", procName, NULL);
1733 safiles = sarraySelectBySubstring(sa, substr);
1734 sarrayDestroy(&sa);
1735 n = sarrayGetCount(safiles);
1736 if (n == 0) {
1737 L_WARNING("no files found\n", procName);
1738 return safiles;
1739 }
1740
1741 sarraySort(safiles, safiles, L_SORT_INCREASING);
1742
1743 first = L_MIN(L_MAX(first, 0), n - 1);
1744 if (nfiles == 0)
1745 nfiles = n - first;
1746 last = L_MIN(first + nfiles - 1, n - 1);
1747
1748 saout = sarrayCreate(last - first + 1);
1749 for (i = first; i <= last; i++) {
1750 fname = sarrayGetString(safiles, i, L_NOCOPY);
1751 fullname = pathJoin(dirname, fname);
1752 sarrayAddString(saout, fullname, L_INSERT);
1753 }
1754
1755 sarrayDestroy(&safiles);
1756 return saout;
1757 }
1758
1759
1760 /*!
1761 * \brief convertSortedToNumberedPathnames()
1762 *
1763 * \param[in] sa sorted pathnames including zero-padded integers
1764 * \param[in] numpre number of characters in name before number
1765 * \param[in] numpost number of characters in name after the number,
1766 * up to a dot before an extension
1767 * \param[in] maxnum only consider page numbers up to this value
1768 * \return sarray of numbered pathnames, or NULL on error
1769 *
1770 * <pre>
1771 * Notes:
1772 * (1) Typically, numpre = numpost = 0; e.g., when the filename
1773 * just has a number followed by an optional extension.
1774 * </pre>
1775 */
1776 SARRAY *
convertSortedToNumberedPathnames(SARRAY * sa,l_int32 numpre,l_int32 numpost,l_int32 maxnum)1777 convertSortedToNumberedPathnames(SARRAY *sa,
1778 l_int32 numpre,
1779 l_int32 numpost,
1780 l_int32 maxnum)
1781 {
1782 char *fname, *str;
1783 l_int32 i, nfiles, num, index;
1784 SARRAY *saout;
1785
1786 PROCNAME("convertSortedToNumberedPathnames");
1787
1788 if (!sa)
1789 return (SARRAY *)ERROR_PTR("sa not defined", procName, NULL);
1790 if ((nfiles = sarrayGetCount(sa)) == 0)
1791 return sarrayCreate(1);
1792
1793 /* Find the last file in the sorted array that has a number
1794 * that (a) matches the count pattern and (b) does not
1795 * exceed %maxnum. %maxnum sets an upper limit on the size
1796 * of the sarray. */
1797 num = 0;
1798 for (i = nfiles - 1; i >= 0; i--) {
1799 fname = sarrayGetString(sa, i, L_NOCOPY);
1800 num = extractNumberFromFilename(fname, numpre, numpost);
1801 if (num < 0) continue;
1802 num = L_MIN(num + 1, maxnum);
1803 break;
1804 }
1805
1806 if (num <= 0) /* none found */
1807 return sarrayCreate(1);
1808
1809 /* Insert pathnames into the output sarray.
1810 * Ignore numbers that are out of the range of sarray. */
1811 saout = sarrayCreateInitialized(num, (char *)"");
1812 for (i = 0; i < nfiles; i++) {
1813 fname = sarrayGetString(sa, i, L_NOCOPY);
1814 index = extractNumberFromFilename(fname, numpre, numpost);
1815 if (index < 0 || index >= num) continue;
1816 str = sarrayGetString(saout, index, L_NOCOPY);
1817 if (str[0] != '\0')
1818 L_WARNING("\n Multiple files with same number: %d\n",
1819 procName, index);
1820 sarrayReplaceString(saout, index, fname, L_COPY);
1821 }
1822
1823 return saout;
1824 }
1825
1826
1827 /*!
1828 * \brief getFilenamesInDirectory()
1829 *
1830 * \param[in] dirname directory name
1831 * \return sarray of file names, or NULL on error
1832 *
1833 * <pre>
1834 * Notes:
1835 * (1) The versions compiled under unix and cygwin use the POSIX C
1836 * library commands for handling directories. For windows,
1837 * there is a separate implementation.
1838 * (2) It returns an array of filename tails; i.e., only the part of
1839 * the path after the last slash.
1840 * (3) Use of the d_type field of dirent is not portable:
1841 * "According to POSIX, the dirent structure contains a field
1842 * char d_name[] of unspecified size, with at most NAME_MAX
1843 * characters preceding the terminating null character. Use
1844 * of other fields will harm the portability of your programs."
1845 * (4) As a consequence of (3), we note several things:
1846 * ~ MINGW doesn't have a d_type member.
1847 * ~ Older versions of gcc (e.g., 2.95.3) return DT_UNKNOWN
1848 * for d_type from all files.
1849 * On these systems, this function will return directories
1850 * (except for '.' and '..', which are eliminated using
1851 * the d_name field).
1852 * </pre>
1853 */
1854
1855 #ifndef _WIN32
1856
1857 SARRAY *
getFilenamesInDirectory(const char * dirname)1858 getFilenamesInDirectory(const char *dirname)
1859 {
1860 char dir[PATH_MAX + 1];
1861 char *realdir, *stat_path, *ignore;
1862 size_t size;
1863 SARRAY *safiles;
1864 DIR *pdir;
1865 struct dirent *pdirentry;
1866 int dfd, stat_ret;
1867 struct stat st;
1868
1869 PROCNAME("getFilenamesInDirectory");
1870
1871 if (!dirname)
1872 return (SARRAY *)ERROR_PTR("dirname not defined", procName, NULL);
1873
1874 /* It's nice to ignore directories. fstatat() works with relative
1875 directory paths, but stat() requires using the absolute path.
1876 Also, do not pass NULL as the second parameter to realpath();
1877 use a buffer of sufficient size. */
1878 ignore = realpath(dirname, dir); /* see note above */
1879 realdir = genPathname(dir, NULL);
1880 if ((pdir = opendir(realdir)) == NULL) {
1881 LEPT_FREE(realdir);
1882 return (SARRAY *)ERROR_PTR("pdir not opened", procName, NULL);
1883 }
1884 safiles = sarrayCreate(0);
1885 dfd = dirfd(pdir);
1886 while ((pdirentry = readdir(pdir))) {
1887 #if HAVE_FSTATAT
1888 stat_ret = fstatat(dfd, pdirentry->d_name, &st, 0);
1889 #else
1890 size = strlen(realdir) + strlen(pdirentry->d_name) + 2;
1891 if (size > PATH_MAX) {
1892 L_ERROR("size = %lu too large; skipping\n", procName,
1893 (unsigned long)size);
1894 continue;
1895 }
1896 stat_path = (char *)LEPT_CALLOC(size, 1);
1897 snprintf(stat_path, size, "%s/%s", realdir, pdirentry->d_name);
1898 stat_ret = stat(stat_path, &st);
1899 LEPT_FREE(stat_path);
1900 #endif
1901 if (stat_ret == 0 && S_ISDIR(st.st_mode))
1902 continue;
1903 sarrayAddString(safiles, pdirentry->d_name, L_COPY);
1904 }
1905 closedir(pdir);
1906 LEPT_FREE(realdir);
1907 return safiles;
1908 }
1909
1910 #else /* _WIN32 */
1911
1912 /* http://msdn2.microsoft.com/en-us/library/aa365200(VS.85).aspx */
1913 #include <windows.h>
1914
1915 SARRAY *
getFilenamesInDirectory(const char * dirname)1916 getFilenamesInDirectory(const char *dirname)
1917 {
1918 char *pszDir;
1919 char *realdir;
1920 HANDLE hFind = INVALID_HANDLE_VALUE;
1921 SARRAY *safiles;
1922 WIN32_FIND_DATAA ffd;
1923
1924 PROCNAME("getFilenamesInDirectory");
1925
1926 if (!dirname)
1927 return (SARRAY *)ERROR_PTR("dirname not defined", procName, NULL);
1928
1929 realdir = genPathname(dirname, NULL);
1930 pszDir = stringJoin(realdir, "\\*");
1931 LEPT_FREE(realdir);
1932
1933 if (strlen(pszDir) + 1 > MAX_PATH) {
1934 LEPT_FREE(pszDir);
1935 return (SARRAY *)ERROR_PTR("dirname is too long", procName, NULL);
1936 }
1937
1938 if ((safiles = sarrayCreate(0)) == NULL) {
1939 LEPT_FREE(pszDir);
1940 return (SARRAY *)ERROR_PTR("safiles not made", procName, NULL);
1941 }
1942
1943 hFind = FindFirstFileA(pszDir, &ffd);
1944 if (INVALID_HANDLE_VALUE == hFind) {
1945 sarrayDestroy(&safiles);
1946 LEPT_FREE(pszDir);
1947 return (SARRAY *)ERROR_PTR("hFind not opened", procName, NULL);
1948 }
1949
1950 while (FindNextFileA(hFind, &ffd) != 0) {
1951 if (ffd.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) /* skip dirs */
1952 continue;
1953 convertSepCharsInPath(ffd.cFileName, UNIX_PATH_SEPCHAR);
1954 sarrayAddString(safiles, ffd.cFileName, L_COPY);
1955 }
1956
1957 FindClose(hFind);
1958 LEPT_FREE(pszDir);
1959 return safiles;
1960 }
1961 #endif /* _WIN32 */
1962