1 /*====================================================================*
2  -  Copyright (C) 2001 Leptonica.  All rights reserved.
3  -
4  -  Redistribution and use in source and binary forms, with or without
5  -  modification, are permitted provided that the following conditions
6  -  are met:
7  -  1. Redistributions of source code must retain the above copyright
8  -     notice, this list of conditions and the following disclaimer.
9  -  2. Redistributions in binary form must reproduce the above
10  -     copyright notice, this list of conditions and the following
11  -     disclaimer in the documentation and/or other materials
12  -     provided with the distribution.
13  -
14  -  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
15  -  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
16  -  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
17  -  A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL ANY
18  -  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19  -  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20  -  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21  -  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22  -  OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
23  -  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24  -  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  *====================================================================*/
26 
27 /*!
28  * \file utils2.c
29  * <pre>
30  *
31  *       Safe string procs
32  *           char      *stringNew()
33  *           l_int32    stringCopy()
34  *           l_int32    stringReplace()
35  *           l_int32    stringLength()
36  *           l_int32    stringCat()
37  *           char      *stringConcatNew()
38  *           char      *stringJoin()
39  *           l_int32    stringJoinIP()
40  *           char      *stringReverse()
41  *           char      *strtokSafe()
42  *           l_int32    stringSplitOnToken()
43  *
44  *       Find and replace string and array procs
45  *           l_int32    stringCheckForChars()
46  *           char      *stringRemoveChars()
47  *           l_int32    stringFindSubstr()
48  *           char      *stringReplaceSubstr()
49  *           char      *stringReplaceEachSubstr()
50  *           L_DNA     *arrayFindEachSequence()
51  *           l_int32    arrayFindSequence()
52  *
53  *       Safe realloc
54  *           void      *reallocNew()
55  *
56  *       Read and write between file and memory
57  *           l_uint8   *l_binaryRead()
58  *           l_uint8   *l_binaryReadStream()
59  *           l_uint8   *l_binaryReadSelect()
60  *           l_uint8   *l_binaryReadSelectStream()
61  *           l_int32    l_binaryWrite()
62  *           l_int32    nbytesInFile()
63  *           l_int32    fnbytesInFile()
64  *
65  *       Copy in memory
66  *           l_uint8   *l_binaryCopy()
67  *
68  *       File copy operations
69  *           l_int32    fileCopy()
70  *           l_int32    fileConcatenate()
71  *           l_int32    fileAppendString()
72  *
73  *       Multi-platform functions for opening file streams
74  *           FILE      *fopenReadStream()
75  *           FILE      *fopenWriteStream()
76  *           FILE      *fopenReadFromMemory()
77  *
78  *       Opening a windows tmpfile for writing
79  *           FILE      *fopenWriteWinTempfile()
80  *
81  *       Multi-platform functions that avoid C-runtime boundary crossing
82  *       with Windows DLLs
83  *           FILE      *lept_fopen()
84  *           l_int32    lept_fclose()
85  *           void      *lept_calloc()
86  *           void       lept_free()
87  *
88  *       Multi-platform file system operations in temp directories
89  *           l_int32    lept_mkdir()
90  *           l_int32    lept_rmdir()
91  *           l_int32    lept_direxists()
92  *           l_int32    lept_mv()
93  *           l_int32    lept_rm_match()
94  *           l_int32    lept_rm()
95  *           l_int32    lept_rmfile()
96  *           l_int32    lept_cp()
97  *
98  *       General file name operations
99  *           l_int32    splitPathAtDirectory()
100  *           l_int32    splitPathAtExtension()
101  *           char      *pathJoin()
102  *           char      *appendSubdirs()
103  *
104  *       Special file name operations
105  *           l_int32    convertSepCharsInPath()
106  *           char      *genPathname()
107  *           l_int32    makeTempDirname()
108  *           l_int32    modifyTrailingSlash()
109  *           char      *l_makeTempFilename()
110  *           l_int32    extractNumberFromFilename()
111  *
112  *
113  *  Notes on multi-platform development
114  *  -----------------------------------
115  *  This is important:
116  *  (1) With the exception of splitPathAtDirectory(), splitPathAtExtension()
117  *      and genPathname(), all input pathnames must have unix separators.
118  *  (2) On Windows, when you specify a read or write to "/tmp/...",
119  *      the filename is rewritten to use the Windows temp directory:
120  *         /tmp  ==>   <Temp>...    (windows)
121  *  (3) This filename rewrite, along with the conversion from unix
122  *      to windows pathnames, happens in genPathname().
123  *  (4) Use fopenReadStream() and fopenWriteStream() to open files,
124  *      because these use genPathname() to find the platform-dependent
125  *      filenames.  Likewise for l_binaryRead() and l_binaryWrite().
126  *  (5) For moving, copying and removing files and directories that are in
127  *      subdirectories of /tmp, use the lept_*() file system shell wrappers:
128  *         lept_mkdir(), lept_rmdir(), lept_mv(), lept_rm() and lept_cp().
129  *  (6) Use the lept_*() C library wrappers.  These work properly on
130  *      Windows, where the same DLL must perform complementary operations
131  *      on file streams (open/close) and heap memory (malloc/free):
132  *         lept_fopen(), lept_fclose(), lept_calloc() and lept_free().
133  *  (7) Why read and write files to temp directories?
134  *      The library needs the ability to read and write ephemeral
135  *      files to default places, both for generating debugging output
136  *      and for supporting regression tests.  Applications also need
137  *      this ability for debugging.
138  *  (8) Why do the pathname rewrite on Windows?
139  *      The goal is to have the library, and programs using the library,
140  *      run on multiple platforms without changes.  The location of
141  *      temporary files depends on the platform as well as the user's
142  *      configuration.  Temp files on Windows are in some directory
143  *      not known a priori.  To make everything work seamlessly on
144  *      Windows, every time you open a file for reading or writing,
145  *      use a special function such as fopenReadStream() or
146  *      fopenWriteStream(); these call genPathname() to ensure that
147  *      if it is a temp file, the correct path is used.  To indicate
148  *      that this is a temp file, the application is written with the
149  *      root directory of the path in a canonical form: "/tmp".
150  *  (9) Why is it that multi-platform directory functions like lept_mkdir()
151  *      and lept_rmdir(), as well as associated file functions like
152  *      lept_rm(), lept_mv() and lept_cp(), only work in the temp dir?
153  *      These functions were designed to provide easy manipulation of
154  *      temp files.  The restriction to temp files is for safety -- to
155  *      prevent an accidental deletion of important files.  For example,
156  *      lept_rmdir() first deletes all files in a specified subdirectory
157  *      of temp, and then removes the directory.
158  *
159  * </pre>
160  */
161 
162 #ifdef HAVE_CONFIG_H
163 #include "config_auto.h"
164 #endif  /* HAVE_CONFIG_H */
165 
166 #ifdef _MSC_VER
167 #include <process.h>
168 #include <direct.h>
169 #else
170 #include <unistd.h>
171 #endif   /* _MSC_VER */
172 
173 #ifdef _WIN32
174 #include <windows.h>
175 #include <fcntl.h>     /* _O_CREAT, ... */
176 #include <io.h>        /* _open */
177 #include <sys/stat.h>  /* _S_IREAD, _S_IWRITE */
178 #else
179 #include <sys/stat.h>  /* for stat, mkdir(2) */
180 #include <sys/types.h>
181 #endif
182 
183 #include <string.h>
184 #include <stddef.h>
185 #include "allheaders.h"
186 
187 /*  This is only used to test "/tmp" --> TMPDIR rewriting on Windows,
188  *  by emulating it in unix.  It should never be on in production. */
189 #define DEBUG_REWRITE    0
190 
191 
192 /*--------------------------------------------------------------------*
193  *                       Safe string operations                       *
194  *--------------------------------------------------------------------*/
195 /*!
196  * \brief   stringNew()
197  *
198  * \param[in]    src string
199  * \return  dest copy of src string, or NULL on error
200  */
201 char *
stringNew(const char * src)202 stringNew(const char  *src)
203 {
204 l_int32  len;
205 char    *dest;
206 
207     PROCNAME("stringNew");
208 
209     if (!src) {
210         L_WARNING("src not defined\n", procName);
211         return NULL;
212     }
213 
214     len = strlen(src);
215     if ((dest = (char *)LEPT_CALLOC(len + 1, sizeof(char))) == NULL)
216         return (char *)ERROR_PTR("dest not made", procName, NULL);
217 
218     stringCopy(dest, src, len);
219     return dest;
220 }
221 
222 
223 /*!
224  * \brief   stringCopy()
225  *
226  * \param[in]    dest existing byte buffer
227  * \param[in]    src string [optional] can be null
228  * \param[in]    n max number of characters to copy
229  * \return  0 if OK, 1 on error
230  *
231  * <pre>
232  * Notes:
233  *      (1) Relatively safe wrapper for strncpy, that checks the input,
234  *          and does not complain if %src is null or %n < 1.
235  *          If %n < 1, this is a no-op.
236  *      (2) %dest needs to be at least %n bytes in size.
237  *      (3) We don't call strncpy() because valgrind complains about
238  *          use of uninitialized values.
239  * </pre>
240  */
241 l_int32
stringCopy(char * dest,const char * src,l_int32 n)242 stringCopy(char        *dest,
243            const char  *src,
244            l_int32      n)
245 {
246 l_int32  i;
247 
248     PROCNAME("stringCopy");
249 
250     if (!dest)
251         return ERROR_INT("dest not defined", procName, 1);
252     if (!src || n < 1)
253         return 0;
254 
255         /* Implementation of strncpy that valgrind doesn't complain about */
256     for (i = 0; i < n && src[i] != '\0'; i++)
257         dest[i] = src[i];
258     for (; i < n; i++)
259         dest[i] = '\0';
260     return 0;
261 }
262 
263 
264 /*!
265  * \brief   stringReplace()
266  *
267  * \param[out]   pdest string copy
268  * \param[in]    src string [optional] can be null
269  * \return  0 if OK; 1 on error
270  *
271  * <pre>
272  * Notes:
273  *      (1) Frees any existing dest string
274  *      (2) Puts a copy of src string in the dest
275  *      (3) If either or both strings are null, does something reasonable.
276  * </pre>
277  */
278 l_int32
stringReplace(char ** pdest,const char * src)279 stringReplace(char       **pdest,
280               const char  *src)
281 {
282     PROCNAME("stringReplace");
283 
284     if (!pdest)
285         return ERROR_INT("pdest not defined", procName, 1);
286 
287     if (*pdest)
288         LEPT_FREE(*pdest);
289 
290     if (src)
291         *pdest = stringNew(src);
292     else
293         *pdest = NULL;
294     return 0;
295 }
296 
297 
298 /*!
299  * \brief   stringLength()
300  *
301  * \param[in]    src string can be null or NULL-terminated string
302  * \param[in]    size size of src buffer
303  * \return  length of src in bytes.
304  *
305  * <pre>
306  * Notes:
307  *      (1) Safe implementation of strlen that only checks size bytes
308  *          for trailing NUL.
309  *      (2) Valid returned string lengths are between 0 and size - 1.
310  *          If size bytes are checked without finding a NUL byte, then
311  *          an error is indicated by returning size.
312  * </pre>
313  */
314 l_int32
stringLength(const char * src,size_t size)315 stringLength(const char  *src,
316              size_t       size)
317 {
318 l_int32  i;
319 
320     PROCNAME("stringLength");
321 
322     if (!src)
323         return ERROR_INT("src not defined", procName, 0);
324     if (size < 1)
325         return 0;
326 
327     for (i = 0; i < size; i++) {
328         if (src[i] == '\0')
329             return i;
330     }
331     return size;  /* didn't find a NUL byte */
332 }
333 
334 
335 /*!
336  * \brief   stringCat()
337  *
338  * \param[in]    dest null-terminated byte buffer
339  * \param[in]    size size of dest
340  * \param[in]    src string can be null or NULL-terminated string
341  * \return  number of bytes added to dest; -1 on error
342  *
343  * <pre>
344  * Notes:
345  *      (1) Alternative implementation of strncat, that checks the input,
346  *          is easier to use (since the size of the dest buffer is specified
347  *          rather than the number of bytes to copy), and does not complain
348  *          if %src is null.
349  *      (2) Never writes past end of dest.
350  *      (3) If it can't append src (an error), it does nothing.
351  *      (4) N.B. The order of 2nd and 3rd args is reversed from that in
352  *          strncat, as in the Windows function strcat_s().
353  * </pre>
354  */
355 l_int32
stringCat(char * dest,size_t size,const char * src)356 stringCat(char        *dest,
357           size_t       size,
358           const char  *src)
359 {
360 l_int32  i, n;
361 l_int32  lendest, lensrc;
362 
363     PROCNAME("stringCat");
364 
365     if (!dest)
366         return ERROR_INT("dest not defined", procName, -1);
367     if (size < 1)
368         return ERROR_INT("size < 1; too small", procName, -1);
369     if (!src)
370         return 0;
371 
372     lendest = stringLength(dest, size);
373     if (lendest == size)
374         return ERROR_INT("no terminating nul byte", procName, -1);
375     lensrc = stringLength(src, size);
376     if (lensrc == 0)
377         return 0;
378     n = (lendest + lensrc > size - 1 ? size - lendest - 1 : lensrc);
379     if (n < 1)
380         return ERROR_INT("dest too small for append", procName, -1);
381 
382     for (i = 0; i < n; i++)
383         dest[lendest + i] = src[i];
384     dest[lendest + n] = '\0';
385     return n;
386 }
387 
388 
389 /*!
390  * \brief   stringConcatNew()
391  *
392  * \param[in]    first first string in list
393  * \param[in]    ...  NULL-terminated list of strings
394  * \return  result new string concatenating the input strings, or
395  *                      NULL if first == NULL
396  *
397  * <pre>
398  * Notes:
399  *      (1) The last arg in the list of strings must be NULL.
400  *      (2) Caller must free the returned string.
401  * </pre>
402  */
403 char *
stringConcatNew(const char * first,...)404 stringConcatNew(const char  *first, ...)
405 {
406 size_t       len;
407 char        *result, *ptr;
408 const char  *arg;
409 va_list      args;
410 
411     if (!first) return NULL;
412 
413         /* Find the length of the output string */
414     va_start(args, first);
415     len = strlen(first);
416     while ((arg = va_arg(args, const char *)) != NULL)
417         len += strlen(arg);
418     va_end(args);
419     result = (char *)LEPT_CALLOC(len + 1, sizeof(char));
420 
421         /* Concatenate the args */
422     va_start(args, first);
423     ptr = result;
424     arg = first;
425     while (*arg)
426         *ptr++ = *arg++;
427     while ((arg = va_arg(args, const char *)) != NULL) {
428         while (*arg)
429             *ptr++ = *arg++;
430     }
431     va_end(args);
432     return result;
433 }
434 
435 
436 /*!
437  * \brief   stringJoin()
438  *
439  * \param[in]    src1 string [optional] can be null
440  * \param[in]    src2 string [optional] can be null
441  * \return  concatenated string, or NULL on error
442  *
443  * <pre>
444  * Notes:
445  *      (1) This is a safe version of strcat; it makes a new string.
446  *      (2) It is not an error if either or both of the strings
447  *          are empty, or if either or both of the pointers are null.
448  * </pre>
449  */
450 char *
stringJoin(const char * src1,const char * src2)451 stringJoin(const char  *src1,
452            const char  *src2)
453 {
454 char    *dest;
455 l_int32  srclen1, srclen2, destlen;
456 
457     PROCNAME("stringJoin");
458 
459     srclen1 = (src1) ? strlen(src1) : 0;
460     srclen2 = (src2) ? strlen(src2) : 0;
461     destlen = srclen1 + srclen2 + 3;
462 
463     if ((dest = (char *)LEPT_CALLOC(destlen, sizeof(char))) == NULL)
464         return (char *)ERROR_PTR("calloc fail for dest", procName, NULL);
465 
466     if (src1)
467         stringCopy(dest, src1, srclen1);
468     if (src2)
469         strncat(dest, src2, srclen2);
470     return dest;
471 }
472 
473 
474 /*!
475  * \brief   stringJoinIP()
476  *
477  * \param[in,out]  psrc1 string address of src1; cannot be on the stack
478  * \param[in]      src2 string [optional] can be null
479  * \return  0 if OK, 1 on error
480  *
481  * <pre>
482  * Notes:
483  *      (1) This is a safe in-place version of strcat.  The contents of
484  *          src1 is replaced by the concatenation of src1 and src2.
485  *      (2) It is not an error if either or both of the strings
486  *          are empty (""), or if the pointers to the strings (*psrc1, src2)
487  *          are null.
488  *      (3) src1 should be initialized to null or an empty string
489  *          before the first call.  Use one of these:
490  *              char *src1 = NULL;
491  *              char *src1 = stringNew("");
492  *          Then call with:
493  *              stringJoinIP(&src1, src2);
494  *      (4) This can also be implemented as a macro:
495  * \code
496  *              #define stringJoinIP(src1, src2) \
497  *                  {tmpstr = stringJoin((src1),(src2)); \
498  *                  LEPT_FREE(src1); \
499  *                  (src1) = tmpstr;}
500  * \endcode
501  *      (5) Another function to consider for joining many strings is
502  *          stringConcatNew().
503  * </pre>
504  */
505 l_int32
stringJoinIP(char ** psrc1,const char * src2)506 stringJoinIP(char       **psrc1,
507              const char  *src2)
508 {
509 char  *tmpstr;
510 
511     PROCNAME("stringJoinIP");
512 
513     if (!psrc1)
514         return ERROR_INT("&src1 not defined", procName, 1);
515 
516     tmpstr = stringJoin(*psrc1, src2);
517     LEPT_FREE(*psrc1);
518     *psrc1 = tmpstr;
519     return 0;
520 }
521 
522 
523 /*!
524  * \brief   stringReverse()
525  *
526  * \param[in]    src string
527  * \return  dest newly-allocated reversed string
528  */
529 char *
stringReverse(const char * src)530 stringReverse(const char  *src)
531 {
532 char    *dest;
533 l_int32  i, len;
534 
535     PROCNAME("stringReverse");
536 
537     if (!src)
538         return (char *)ERROR_PTR("src not defined", procName, NULL);
539     len = strlen(src);
540     if ((dest = (char *)LEPT_CALLOC(len + 1, sizeof(char))) == NULL)
541         return (char *)ERROR_PTR("calloc fail for dest", procName, NULL);
542     for (i = 0; i < len; i++)
543         dest[i] = src[len - 1 - i];
544 
545     return dest;
546 }
547 
548 
549 /*!
550  * \brief   strtokSafe()
551  *
552  * \param[in]    cstr input string to be sequentially parsed;
553  *                    use NULL after the first call
554  * \param[in]    seps a string of character separators
555  * \param[out]   psaveptr ptr to the next char after
556  *                        the last encountered separator
557  * \return  substr a new string that is copied from the previous
558  *                      saveptr up to but not including the next
559  *                      separator character, or NULL if end of cstr.
560  *
561  * <pre>
562  * Notes:
563  *      (1) This is a thread-safe implementation of strtok.
564  *      (2) It has the same interface as strtok_r.
565  *      (3) It differs from strtok_r in usage in two respects:
566  *          (a) the input string is not altered
567  *          (b) each returned substring is newly allocated and must
568  *              be freed after use.
569  *      (4) Let me repeat that.  This is "safe" because the input
570  *          string is not altered and because each returned string
571  *          is newly allocated on the heap.
572  *      (5) It is here because, surprisingly, some C libraries don't
573  *          include strtok_r.
574  *      (6) Important usage points:
575  *          ~ Input the string to be parsed on the first invocation.
576  *          ~ Then input NULL after that; the value returned in saveptr
577  *            is used in all subsequent calls.
578  *      (7) This is only slightly slower than strtok_k.
579  * </pre>
580  */
581 char *
strtokSafe(char * cstr,const char * seps,char ** psaveptr)582 strtokSafe(char        *cstr,
583            const char  *seps,
584            char       **psaveptr)
585 {
586 char     nextc;
587 char    *start, *substr;
588 l_int32  istart, i, j, nchars;
589 
590     PROCNAME("strtokSafe");
591 
592     if (!seps)
593         return (char *)ERROR_PTR("seps not defined", procName, NULL);
594     if (!psaveptr)
595         return (char *)ERROR_PTR("&saveptr not defined", procName, NULL);
596 
597     if (!cstr) {
598         start = *psaveptr;
599     } else {
600         start = cstr;
601         *psaveptr = NULL;
602     }
603     if (!start)  /* nothing to do */
604         return NULL;
605 
606         /* First time, scan for the first non-sep character */
607     istart = 0;
608     if (cstr) {
609         for (istart = 0;; istart++) {
610             if ((nextc = start[istart]) == '\0') {
611                 *psaveptr = NULL;  /* in case caller doesn't check ret value */
612                 return NULL;
613             }
614             if (!strchr(seps, nextc))
615                 break;
616         }
617     }
618 
619         /* Scan through, looking for a sep character; if none is
620          * found, 'i' will be at the end of the string. */
621     for (i = istart;; i++) {
622         if ((nextc = start[i]) == '\0')
623             break;
624         if (strchr(seps, nextc))
625             break;
626     }
627 
628         /* Save the substring */
629     nchars = i - istart;
630     substr = (char *)LEPT_CALLOC(nchars + 1, sizeof(char));
631     stringCopy(substr, start + istart, nchars);
632 
633         /* Look for the next non-sep character.
634          * If this is the last substring, return a null saveptr. */
635     for (j = i;; j++) {
636         if ((nextc = start[j]) == '\0') {
637             *psaveptr = NULL;  /* no more non-sep characters */
638             break;
639         }
640         if (!strchr(seps, nextc)) {
641             *psaveptr = start + j;  /* start here on next call */
642                 break;
643         }
644     }
645 
646     return substr;
647 }
648 
649 
650 /*!
651  * \brief   stringSplitOnToken()
652  *
653  * \param[in]    cstr input string to be split; not altered
654  * \param[in]    seps a string of character separators
655  * \param[out]   phead ptr to copy of the input string, up to
656  *                     the first separator token encountered
657  * \param[out]   ptail ptr to copy of the part of the input string
658  *                     starting with the first non-separator character
659  *                     that occurs after the first separator is found
660  * \return  0 if OK, 1 on error
661  *
662  * <pre>
663  * Notes:
664  *      (1) The input string is not altered; all split parts are new strings.
665  *      (2) The split occurs around the first consecutive sequence of
666  *          tokens encountered.
667  *      (3) The head goes from the beginning of the string up to
668  *          but not including the first token found.
669  *      (4) The tail contains the second part of the string, starting
670  *          with the first char in that part that is NOT a token.
671  *      (5) If no separator token is found, 'head' contains a copy
672  *          of the input string and 'tail' is null.
673  * </pre>
674  */
675 l_int32
stringSplitOnToken(char * cstr,const char * seps,char ** phead,char ** ptail)676 stringSplitOnToken(char        *cstr,
677                    const char  *seps,
678                    char       **phead,
679                    char       **ptail)
680 {
681 char  *saveptr;
682 
683     PROCNAME("stringSplitOnToken");
684 
685     if (!phead)
686         return ERROR_INT("&head not defined", procName, 1);
687     if (!ptail)
688         return ERROR_INT("&tail not defined", procName, 1);
689     *phead = *ptail = NULL;
690     if (!cstr)
691         return ERROR_INT("cstr not defined", procName, 1);
692     if (!seps)
693         return ERROR_INT("seps not defined", procName, 1);
694 
695     *phead = strtokSafe(cstr, seps, &saveptr);
696     if (saveptr)
697         *ptail = stringNew(saveptr);
698     return 0;
699 }
700 
701 
702 /*--------------------------------------------------------------------*
703  *                       Find and replace procs                       *
704  *--------------------------------------------------------------------*/
705 /*!
706  * \brief   stringCheckForChars()
707  *
708  * \param[in]    src      input string; can be of zero length
709  * \param[in]    chars    string of chars to be searched for in %src
710  * \param[out]   pfound   1 if any characters are found; 0 otherwise
711  * \return  0 if OK, 1 on error
712  *
713  * <pre>
714  * Notes:
715  *      (1) This can be used to sanitize an operation by checking for
716  *          special characters that don't belong in a string.
717  * </pre>
718  */
719 l_int32
stringCheckForChars(const char * src,const char * chars,l_int32 * pfound)720 stringCheckForChars(const char  *src,
721                     const char  *chars,
722                     l_int32     *pfound)
723 {
724 char     ch;
725 l_int32  i, n;
726 
727     PROCNAME("stringCheckForChars");
728 
729     if (!pfound)
730         return ERROR_INT("&found not defined", procName, 1);
731     *pfound = FALSE;
732     if (!src || !chars)
733         return ERROR_INT("src and chars not both defined", procName, 1);
734 
735     n = strlen(src);
736     for (i = 0; i < n; i++) {
737         ch = src[i];
738         if (strchr(chars, ch)) {
739             *pfound = TRUE;
740             break;
741         }
742     }
743     return 0;
744 }
745 
746 
747 /*!
748  * \brief   stringRemoveChars()
749  *
750  * \param[in]    src input string; can be of zero length
751  * \param[in]    remchars  string of chars to be removed from src
752  * \return  dest string with specified chars removed, or NULL on error
753  */
754 char *
stringRemoveChars(const char * src,const char * remchars)755 stringRemoveChars(const char  *src,
756                   const char  *remchars)
757 {
758 char     ch;
759 char    *dest;
760 l_int32  nsrc, i, k;
761 
762     PROCNAME("stringRemoveChars");
763 
764     if (!src)
765         return (char *)ERROR_PTR("src not defined", procName, NULL);
766     if (!remchars)
767         return stringNew(src);
768 
769     if ((dest = (char *)LEPT_CALLOC(strlen(src) + 1, sizeof(char))) == NULL)
770         return (char *)ERROR_PTR("dest not made", procName, NULL);
771     nsrc = strlen(src);
772     for (i = 0, k = 0; i < nsrc; i++) {
773         ch = src[i];
774         if (!strchr(remchars, ch))
775             dest[k++] = ch;
776     }
777 
778     return dest;
779 }
780 
781 
782 /*!
783  * \brief   stringFindSubstr()
784  *
785  * \param[in]    src input string; can be of zero length
786  * \param[in]    sub substring to be searched for
787  * \param[out]   ploc [optional] location of substring in src
788  * \return  1 if found; 0 if not found or on error
789  *
790  * <pre>
791  * Notes:
792  *      (1) This is a wrapper around strstr().
793  *      (2) Both %src and %sub must be defined, and %sub must have
794  *          length of at least 1.
795  *      (3) If the substring is not found and loc is returned, it has
796  *          the value -1.
797  * </pre>
798  */
799 l_int32
stringFindSubstr(const char * src,const char * sub,l_int32 * ploc)800 stringFindSubstr(const char  *src,
801                  const char  *sub,
802                  l_int32     *ploc)
803 {
804 char  *ptr;
805 
806     PROCNAME("stringFindSubstr");
807 
808     if (!src)
809         return ERROR_INT("src not defined", procName, 0);
810     if (!sub)
811         return ERROR_INT("sub not defined", procName, 0);
812     if (ploc) *ploc = -1;
813     if (strlen(sub) == 0)
814         return ERROR_INT("substring length 0", procName, 0);
815     if (strlen(src) == 0)
816         return 0;
817 
818     if ((ptr = (char *)strstr(src, sub)) == NULL)  /* not found */
819         return 0;
820 
821     if (ploc)
822         *ploc = ptr - src;
823     return 1;
824 }
825 
826 
827 /*!
828  * \brief   stringReplaceSubstr()
829  *
830  * \param[in]    src input string; can be of zero length
831  * \param[in]    sub1 substring to be replaced
832  * \param[in]    sub2 substring to put in; can be ""
833  * \param[out]   pfound [optional] 1 if sub1 is found; 0 otherwise
834  * \param[out]   ploc [optional] location of ptr after replacement
835  * \return  dest string with substring replaced, or NULL if the
836  *              substring not found or on error.
837  *
838  * <pre>
839  * Notes:
840  *      (1) Replaces the first instance.
841  *      (2) To only remove sub1, use "" for sub2
842  *      (3) Returns a new string if sub1 and sub2 are the same.
843  *      (4) The optional loc is input as the byte offset within the src
844  *          from which the search starts, and after the search it is the
845  *          char position in the string of the next character after
846  *          the substituted string.
847  *      (5) N.B. If ploc is not null, loc must always be initialized.
848  *          To search the string from the beginning, set loc = 0.
849  * </pre>
850  */
851 char *
stringReplaceSubstr(const char * src,const char * sub1,const char * sub2,l_int32 * pfound,l_int32 * ploc)852 stringReplaceSubstr(const char  *src,
853                     const char  *sub1,
854                     const char  *sub2,
855                     l_int32     *pfound,
856                     l_int32     *ploc)
857 {
858 char    *ptr, *dest;
859 l_int32  nsrc, nsub1, nsub2, len, npre, loc;
860 
861     PROCNAME("stringReplaceSubstr");
862 
863     if (!src)
864         return (char *)ERROR_PTR("src not defined", procName, NULL);
865     if (!sub1)
866         return (char *)ERROR_PTR("sub1 not defined", procName, NULL);
867     if (!sub2)
868         return (char *)ERROR_PTR("sub2 not defined", procName, NULL);
869 
870     if (pfound)
871         *pfound = 0;
872     if (ploc)
873         loc = *ploc;
874     else
875         loc = 0;
876     if ((ptr = (char *)strstr(src + loc, sub1)) == NULL) {
877         return NULL;
878     }
879 
880     if (pfound)
881         *pfound = 1;
882     nsrc = strlen(src);
883     nsub1 = strlen(sub1);
884     nsub2 = strlen(sub2);
885     len = nsrc + nsub2 - nsub1;
886     if ((dest = (char *)LEPT_CALLOC(len + 1, sizeof(char))) == NULL)
887         return (char *)ERROR_PTR("dest not made", procName, NULL);
888     npre = ptr - src;
889     memcpy(dest, src, npre);
890     strcpy(dest + npre, sub2);
891     strcpy(dest + npre + nsub2, ptr + nsub1);
892     if (ploc)
893         *ploc = npre + nsub2;
894 
895     return dest;
896 }
897 
898 
899 /*!
900  * \brief   stringReplaceEachSubstr()
901  *
902  * \param[in]    src input string; can be of zero length
903  * \param[in]    sub1 substring to be replaced
904  * \param[in]    sub2 substring to put in; can be ""
905  * \param[out]   pcount [optional] the number of times that sub1
906  *                      is found in src; 0 if not found
907  * \return  dest string with substring replaced, or NULL if the
908  *              substring not found or on error.
909  *
910  * <pre>
911  * Notes:
912  *      (1) Replaces every instance.
913  *      (2) To only remove each instance of sub1, use "" for sub2
914  *      (3) Returns NULL if sub1 and sub2 are the same.
915  * </pre>
916  */
917 char *
stringReplaceEachSubstr(const char * src,const char * sub1,const char * sub2,l_int32 * pcount)918 stringReplaceEachSubstr(const char  *src,
919                         const char  *sub1,
920                         const char  *sub2,
921                         l_int32     *pcount)
922 {
923 char    *currstr, *newstr;
924 l_int32  loc;
925 
926     PROCNAME("stringReplaceEachSubstr");
927 
928     if (pcount) *pcount = 0;
929     if (!src)
930         return (char *)ERROR_PTR("src not defined", procName, NULL);
931     if (!sub1)
932         return (char *)ERROR_PTR("sub1 not defined", procName, NULL);
933     if (!sub2)
934         return (char *)ERROR_PTR("sub2 not defined", procName, NULL);
935 
936     loc = 0;
937     if ((newstr = stringReplaceSubstr(src, sub1, sub2, NULL, &loc)) == NULL)
938         return NULL;
939 
940     if (pcount)
941         (*pcount)++;
942     while (1) {
943         currstr = newstr;
944         newstr = stringReplaceSubstr(currstr, sub1, sub2, NULL, &loc);
945         if (!newstr)
946             return currstr;
947         LEPT_FREE(currstr);
948         if (pcount)
949             (*pcount)++;
950     }
951 }
952 
953 
954 /*!
955  * \brief   arrayFindEachSequence()
956  *
957  * \param[in]    data byte array
958  * \param[in]    datalen length of data, in bytes
959  * \param[in]    sequence subarray of bytes to find in data
960  * \param[in]    seqlen length of sequence, in bytes
961  * \return  dna of offsets where the sequence is found, or NULL if
962  *              none are found or on error
963  *
964  * <pre>
965  * Notes:
966  *      (1) The byte arrays %data and %sequence are not C strings,
967  *          as they can contain null bytes.  Therefore, for each
968  *          we must give the length of the array.
969  *      (2) This finds every occurrence in %data of %sequence.
970  * </pre>
971  */
972 L_DNA *
arrayFindEachSequence(const l_uint8 * data,size_t datalen,const l_uint8 * sequence,size_t seqlen)973 arrayFindEachSequence(const l_uint8  *data,
974                       size_t          datalen,
975                       const l_uint8  *sequence,
976                       size_t          seqlen)
977 {
978 l_int32  start, offset, realoffset, found;
979 L_DNA   *da;
980 
981     PROCNAME("arrayFindEachSequence");
982 
983     if (!data || !sequence)
984         return (L_DNA *)ERROR_PTR("data & sequence not both defined",
985                                   procName, NULL);
986 
987     da = l_dnaCreate(0);
988     start = 0;
989     while (1) {
990         arrayFindSequence(data + start, datalen - start, sequence, seqlen,
991                           &offset, &found);
992         if (found == FALSE)
993             break;
994 
995         realoffset = start + offset;
996         l_dnaAddNumber(da, realoffset);
997         start = realoffset + seqlen;
998         if (start >= datalen)
999             break;
1000     }
1001 
1002     if (l_dnaGetCount(da) == 0)
1003         l_dnaDestroy(&da);
1004     return da;
1005 }
1006 
1007 
1008 /*!
1009  * \brief   arrayFindSequence()
1010  *
1011  * \param[in]    data byte array
1012  * \param[in]    datalen length of data, in bytes
1013  * \param[in]    sequence subarray of bytes to find in data
1014  * \param[in]    seqlen length of sequence, in bytes
1015  * \param[out]   poffset offset from beginning of
1016  *                       data where the sequence begins
1017  * \param[out]   pfound 1 if sequence is found; 0 otherwise
1018  * \return  0 if OK, 1 on error
1019  *
1020  * <pre>
1021  * Notes:
1022  *      (1) The byte arrays 'data' and 'sequence' are not C strings,
1023  *          as they can contain null bytes.  Therefore, for each
1024  *          we must give the length of the array.
1025  *      (2) This searches for the first occurrence in %data of %sequence,
1026  *          which consists of %seqlen bytes.  The parameter %seqlen
1027  *          must not exceed the actual length of the %sequence byte array.
1028  *      (3) If the sequence is not found, the offset will be 0, so you
1029  *          must check %found.
1030  * </pre>
1031  */
1032 l_int32
arrayFindSequence(const l_uint8 * data,size_t datalen,const l_uint8 * sequence,size_t seqlen,l_int32 * poffset,l_int32 * pfound)1033 arrayFindSequence(const l_uint8  *data,
1034                   size_t          datalen,
1035                   const l_uint8  *sequence,
1036                   size_t          seqlen,
1037                   l_int32        *poffset,
1038                   l_int32        *pfound)
1039 {
1040 l_int32  i, j, found, lastpos;
1041 
1042     PROCNAME("arrayFindSequence");
1043 
1044     if (poffset) *poffset = 0;
1045     if (pfound) *pfound = FALSE;
1046     if (!data || !sequence)
1047         return ERROR_INT("data & sequence not both defined", procName, 1);
1048     if (!poffset || !pfound)
1049         return ERROR_INT("&offset and &found not defined", procName, 1);
1050 
1051     lastpos = datalen - seqlen + 1;
1052     found = FALSE;
1053     for (i = 0; i < lastpos; i++) {
1054         for (j = 0; j < seqlen; j++) {
1055             if (data[i + j] != sequence[j])
1056                  break;
1057             if (j == seqlen - 1)
1058                  found = TRUE;
1059         }
1060         if (found == TRUE)
1061             break;
1062     }
1063 
1064     if (found == TRUE) {
1065         *poffset = i;
1066         *pfound = TRUE;
1067     }
1068     return 0;
1069 }
1070 
1071 
1072 /*--------------------------------------------------------------------*
1073  *                             Safe realloc                           *
1074  *--------------------------------------------------------------------*/
1075 /*!
1076  * \brief   reallocNew()
1077  *
1078  * \param[in,out]  pindata [optional]; nulls indata
1079  * \param[in]      oldsize size of input data to be copied, in bytes
1080  * \param[in]      newsize size of data to be reallocated in bytes
1081  * \return  ptr to new data, or NULL on error
1082  *
1083  *  Action: !N.B. 3) and (4!
1084  *      1 Allocates memory, initialized to 0
1085  *      2 Copies as much of the input data as possible
1086  *          to the new block, truncating the copy if necessary
1087  *      3 Frees the input data
1088  *      4 Zeroes the input data ptr
1089  *
1090  * <pre>
1091  * Notes:
1092  *      (1) If newsize <=0, just frees input data and nulls ptr
1093  *      (2) If input ptr is null, just callocs new memory
1094  *      (3) This differs from realloc in that it always allocates
1095  *          new memory (if newsize > 0) and initializes it to 0,
1096  *          it requires the amount of old data to be copied,
1097  *          and it takes the address of the input ptr and
1098  *          nulls the handle.
1099  * </pre>
1100  */
1101 void *
reallocNew(void ** pindata,l_int32 oldsize,l_int32 newsize)1102 reallocNew(void   **pindata,
1103            l_int32  oldsize,
1104            l_int32  newsize)
1105 {
1106 l_int32  minsize;
1107 void    *indata;
1108 void    *newdata;
1109 
1110     PROCNAME("reallocNew");
1111 
1112     if (!pindata)
1113         return ERROR_PTR("input data not defined", procName, NULL);
1114     indata = *pindata;
1115 
1116     if (newsize <= 0) {   /* nonstandard usage */
1117         if (indata) {
1118             LEPT_FREE(indata);
1119             *pindata = NULL;
1120         }
1121         return NULL;
1122     }
1123 
1124     if (!indata) {  /* nonstandard usage */
1125         if ((newdata = (void *)LEPT_CALLOC(1, newsize)) == NULL)
1126             return ERROR_PTR("newdata not made", procName, NULL);
1127         return newdata;
1128     }
1129 
1130         /* Standard usage */
1131     if ((newdata = (void *)LEPT_CALLOC(1, newsize)) == NULL)
1132         return ERROR_PTR("newdata not made", procName, NULL);
1133     minsize = L_MIN(oldsize, newsize);
1134     memcpy((char *)newdata, (char *)indata, minsize);
1135 
1136     LEPT_FREE(indata);
1137     *pindata = NULL;
1138 
1139     return newdata;
1140 }
1141 
1142 
1143 /*--------------------------------------------------------------------*
1144  *                 Read and write between file and memory             *
1145  *--------------------------------------------------------------------*/
1146 /*!
1147  * \brief   l_binaryRead()
1148  *
1149  * \param[in]    filename
1150  * \param[out]   pnbytes number of bytes read
1151  * \return  data, or NULL on error
1152  */
1153 l_uint8 *
l_binaryRead(const char * filename,size_t * pnbytes)1154 l_binaryRead(const char  *filename,
1155              size_t      *pnbytes)
1156 {
1157 l_uint8  *data;
1158 FILE     *fp;
1159 
1160     PROCNAME("l_binaryRead");
1161 
1162     if (!pnbytes)
1163         return (l_uint8 *)ERROR_PTR("pnbytes not defined", procName, NULL);
1164     *pnbytes = 0;
1165     if (!filename)
1166         return (l_uint8 *)ERROR_PTR("filename not defined", procName, NULL);
1167 
1168     if ((fp = fopenReadStream(filename)) == NULL)
1169         return (l_uint8 *)ERROR_PTR("file stream not opened", procName, NULL);
1170     data = l_binaryReadStream(fp, pnbytes);
1171     fclose(fp);
1172     return data;
1173 }
1174 
1175 
1176 /*!
1177  * \brief   l_binaryReadStream()
1178  *
1179  * \param[in]    fp file stream opened to read; can be stdin
1180  * \param[out]   pnbytes number of bytes read
1181  * \return  null-terminated array, or NULL on error
1182  *              reading 0 bytes is not an error
1183  *
1184  * <pre>
1185  * Notes:
1186  *      (1) The returned array is terminated with a null byte so that it can
1187  *          be used to read ascii data from a file into a proper C string.
1188  *      (2) This can be used to capture data that is piped in via stdin,
1189  *          because it does not require seeking within the file.
1190  *      (3) For example, you can read an image from stdin into memory
1191  *          using shell redirection, with one of these shell commands:
1192  *             cat <imagefile> | readprog
1193  *             readprog < <imagefile>
1194  *          where readprog is:
1195  *             l_uint8 *data = l_binaryReadStream(stdin, &nbytes);
1196  *             Pix *pix = pixReadMem(data, nbytes);
1197  * </pre>
1198  */
1199 l_uint8 *
l_binaryReadStream(FILE * fp,size_t * pnbytes)1200 l_binaryReadStream(FILE    *fp,
1201                    size_t  *pnbytes)
1202 {
1203 l_uint8    *data;
1204 l_int32     seekable, navail, nadd, nread;
1205 L_BBUFFER  *bb;
1206 
1207     PROCNAME("l_binaryReadStream");
1208 
1209     if (!pnbytes)
1210         return (l_uint8 *)ERROR_PTR("&nbytes not defined", procName, NULL);
1211     *pnbytes = 0;
1212     if (!fp)
1213         return (l_uint8 *)ERROR_PTR("fp not defined", procName, NULL);
1214 
1215         /* Test if the stream is seekable, by attempting to seek to
1216          * the start of data.  This is a no-op.  If it is seekable, use
1217          * l_binaryReadSelectStream() to determine the size of the
1218          * data to be read in advance. */
1219     seekable = (ftell(fp) == 0) ? 1 : 0;
1220     if (seekable)
1221         return l_binaryReadSelectStream(fp, 0, 0, pnbytes);
1222 
1223         /* If it is not seekable, use the bbuffer to realloc memory
1224          * as needed during reading. */
1225     bb = bbufferCreate(NULL, 4096);
1226     while (1) {
1227         navail = bb->nalloc - bb->n;
1228         if (navail < 4096) {
1229              nadd = L_MAX(bb->nalloc, 4096);
1230              bbufferExtendArray(bb, nadd);
1231         }
1232         nread = fread((void *)(bb->array + bb->n), 1, 4096, fp);
1233         bb->n += nread;
1234         if (nread != 4096) break;
1235     }
1236 
1237         /* Copy the data to a new array sized for the data, because
1238          * the bbuffer array can be nearly twice the size we need. */
1239     if ((data = (l_uint8 *)LEPT_CALLOC(bb->n + 1, sizeof(l_uint8))) != NULL) {
1240         memcpy(data, bb->array, bb->n);
1241         *pnbytes = bb->n;
1242     } else {
1243         L_ERROR("calloc fail for data\n", procName);
1244     }
1245 
1246     bbufferDestroy(&bb);
1247     return data;
1248 }
1249 
1250 
1251 /*!
1252  * \brief   l_binaryReadSelect()
1253  *
1254  * \param[in]    filename
1255  * \param[in]    start first byte to read
1256  * \param[in]    nbytes number of bytes to read; use 0 to read to end of file
1257  * \param[out]   pnread number of bytes actually read
1258  * \return  data, or NULL on error
1259  *
1260  * <pre>
1261  * Notes:
1262  *      (1) The returned array is terminated with a null byte so that it can
1263  *          be used to read ascii data from a file into a proper C string.
1264  * </pre>
1265  */
1266 l_uint8 *
l_binaryReadSelect(const char * filename,size_t start,size_t nbytes,size_t * pnread)1267 l_binaryReadSelect(const char  *filename,
1268                    size_t       start,
1269                    size_t       nbytes,
1270                    size_t      *pnread)
1271 {
1272 l_uint8  *data;
1273 FILE     *fp;
1274 
1275     PROCNAME("l_binaryReadSelect");
1276 
1277     if (!pnread)
1278         return (l_uint8 *)ERROR_PTR("pnread not defined", procName, NULL);
1279     *pnread = 0;
1280     if (!filename)
1281         return (l_uint8 *)ERROR_PTR("filename not defined", procName, NULL);
1282 
1283     if ((fp = fopenReadStream(filename)) == NULL)
1284         return (l_uint8 *)ERROR_PTR("file stream not opened", procName, NULL);
1285     data = l_binaryReadSelectStream(fp, start, nbytes, pnread);
1286     fclose(fp);
1287     return data;
1288 }
1289 
1290 
1291 /*!
1292  * \brief   l_binaryReadSelectStream()
1293  *
1294  * \param[in]    fp file stream
1295  * \param[in]    start first byte to read
1296  * \param[in]    nbytes number of bytes to read; use 0 to read to end of file
1297  * \param[out]   pnread number of bytes actually read
1298  * \return  null-terminated array, or NULL on error
1299  *              reading 0 bytes is not an error
1300  *
1301  * <pre>
1302  * Notes:
1303  *      (1) The returned array is terminated with a null byte so that it can
1304  *          be used to read ascii data from a file into a proper C string.
1305  *          If the file to be read is empty and %start == 0, an array
1306  *          with a single null byte is returned.
1307  *      (2) Side effect: the stream pointer is re-positioned to the
1308  *          beginning of the file.
1309  * </pre>
1310  */
1311 l_uint8 *
l_binaryReadSelectStream(FILE * fp,size_t start,size_t nbytes,size_t * pnread)1312 l_binaryReadSelectStream(FILE    *fp,
1313                          size_t   start,
1314                          size_t   nbytes,
1315                          size_t  *pnread)
1316 {
1317 l_uint8  *data;
1318 size_t    bytesleft, bytestoread, nread, filebytes;
1319 
1320     PROCNAME("l_binaryReadSelectStream");
1321 
1322     if (!pnread)
1323         return (l_uint8 *)ERROR_PTR("&nread not defined", procName, NULL);
1324     *pnread = 0;
1325     if (!fp)
1326         return (l_uint8 *)ERROR_PTR("stream not defined", procName, NULL);
1327 
1328         /* Verify and adjust the parameters if necessary */
1329     fseek(fp, 0, SEEK_END);  /* EOF */
1330     filebytes = ftell(fp);
1331     fseek(fp, 0, SEEK_SET);
1332     if (start > filebytes) {
1333         L_ERROR("start = %lu but filebytes = %lu\n", procName,
1334                 (unsigned long)start, (unsigned long)filebytes);
1335         return NULL;
1336     }
1337     if (filebytes == 0)  /* start == 0; nothing to read; return null byte */
1338         return (l_uint8 *)LEPT_CALLOC(1, 1);
1339     bytesleft = filebytes - start;  /* greater than 0 */
1340     if (nbytes == 0) nbytes = bytesleft;
1341     bytestoread = (bytesleft >= nbytes) ? nbytes : bytesleft;
1342 
1343         /* Read the data */
1344     if ((data = (l_uint8 *)LEPT_CALLOC(1, bytestoread + 1)) == NULL)
1345         return (l_uint8 *)ERROR_PTR("calloc fail for data", procName, NULL);
1346     fseek(fp, start, SEEK_SET);
1347     nread = fread(data, 1, bytestoread, fp);
1348     if (nbytes != nread)
1349         L_INFO("%lu bytes requested; %lu bytes read\n", procName,
1350                (unsigned long)nbytes, (unsigned long)nread);
1351     *pnread = nread;
1352     fseek(fp, 0, SEEK_SET);
1353     return data;
1354 }
1355 
1356 
1357 /*!
1358  * \brief   l_binaryWrite()
1359  *
1360  * \param[in]    filename output
1361  * \param[in]    operation  "w" for write; "a" for append
1362  * \param[in]    data  binary data to be written
1363  * \param[in]    nbytes  size of data array
1364  * \return  0 if OK; 1 on error
1365  */
1366 l_int32
l_binaryWrite(const char * filename,const char * operation,void * data,size_t nbytes)1367 l_binaryWrite(const char  *filename,
1368               const char  *operation,
1369               void        *data,
1370               size_t       nbytes)
1371 {
1372 char   actualOperation[20];
1373 FILE  *fp;
1374 
1375     PROCNAME("l_binaryWrite");
1376 
1377     if (!filename)
1378         return ERROR_INT("filename not defined", procName, 1);
1379     if (!operation)
1380         return ERROR_INT("operation not defined", procName, 1);
1381     if (!data)
1382         return ERROR_INT("data not defined", procName, 1);
1383     if (nbytes <= 0)
1384         return ERROR_INT("nbytes must be > 0", procName, 1);
1385 
1386     if (strcmp(operation, "w") && strcmp(operation, "a"))
1387         return ERROR_INT("operation not one of {'w','a'}", procName, 1);
1388 
1389         /* The 'b' flag to fopen() is ignored for all POSIX
1390          * conforming systems.  However, Windows needs the 'b' flag. */
1391     stringCopy(actualOperation, operation, 2);
1392     strncat(actualOperation, "b", 2);
1393 
1394     if ((fp = fopenWriteStream(filename, actualOperation)) == NULL)
1395         return ERROR_INT("stream not opened", procName, 1);
1396     fwrite(data, 1, nbytes, fp);
1397     fclose(fp);
1398     return 0;
1399 }
1400 
1401 
1402 /*!
1403  * \brief   nbytesInFile()
1404  *
1405  * \param[in]    filename
1406  * \return  nbytes in file; 0 on error
1407  */
1408 size_t
nbytesInFile(const char * filename)1409 nbytesInFile(const char  *filename)
1410 {
1411 size_t  nbytes;
1412 FILE   *fp;
1413 
1414     PROCNAME("nbytesInFile");
1415 
1416     if (!filename)
1417         return ERROR_INT("filename not defined", procName, 0);
1418     if ((fp = fopenReadStream(filename)) == NULL)
1419         return ERROR_INT("stream not opened", procName, 0);
1420     nbytes = fnbytesInFile(fp);
1421     fclose(fp);
1422     return nbytes;
1423 }
1424 
1425 
1426 /*!
1427  * \brief   fnbytesInFile()
1428  *
1429  * \param[in]    fp file stream
1430  * \return  nbytes in file; 0 on error
1431  */
1432 size_t
fnbytesInFile(FILE * fp)1433 fnbytesInFile(FILE  *fp)
1434 {
1435 l_int64  pos, nbytes;
1436 
1437     PROCNAME("fnbytesInFile");
1438 
1439     if (!fp)
1440         return ERROR_INT("stream not open", procName, 0);
1441 
1442     pos = ftell(fp);          /* initial position */
1443     fseek(fp, 0, SEEK_END);   /* EOF */
1444     nbytes = ftell(fp);
1445     fseek(fp, pos, SEEK_SET);        /* back to initial position */
1446     return nbytes;
1447 }
1448 
1449 
1450 /*--------------------------------------------------------------------*
1451  *                            Copy in memory                          *
1452  *--------------------------------------------------------------------*/
1453 /*!
1454  * \brief   l_binaryCopy()
1455  *
1456  * \param[in]    datas
1457  * \param[in]    size of data array
1458  * \return  datad on heap, or NULL on error
1459  *
1460  * <pre>
1461  * Notes:
1462  *      (1) We add 4 bytes to the zeroed output because in some cases
1463  *          (e.g., string handling) it is important to have the data
1464  *          be null terminated.  This guarantees that after the memcpy,
1465  *          the result is automatically null terminated.
1466  * </pre>
1467  */
1468 l_uint8 *
l_binaryCopy(l_uint8 * datas,size_t size)1469 l_binaryCopy(l_uint8  *datas,
1470              size_t    size)
1471 {
1472 l_uint8  *datad;
1473 
1474     PROCNAME("l_binaryCopy");
1475 
1476     if (!datas)
1477         return (l_uint8 *)ERROR_PTR("datas not defined", procName, NULL);
1478 
1479     if ((datad = (l_uint8 *)LEPT_CALLOC(size + 4, sizeof(l_uint8))) == NULL)
1480         return (l_uint8 *)ERROR_PTR("datad not made", procName, NULL);
1481     memcpy(datad, datas, size);
1482     return datad;
1483 }
1484 
1485 
1486 /*--------------------------------------------------------------------*
1487  *                         File copy operations                       *
1488  *--------------------------------------------------------------------*/
1489 /*!
1490  * \brief   fileCopy()
1491  *
1492  * \param[in]    srcfile copy this file
1493  * \param[in]    newfile to this file
1494  * \return  0 if OK, 1 on error
1495  */
1496 l_int32
fileCopy(const char * srcfile,const char * newfile)1497 fileCopy(const char  *srcfile,
1498          const char  *newfile)
1499 {
1500 l_int32   ret;
1501 size_t    nbytes;
1502 l_uint8  *data;
1503 
1504     PROCNAME("fileCopy");
1505 
1506     if (!srcfile)
1507         return ERROR_INT("srcfile not defined", procName, 1);
1508     if (!newfile)
1509         return ERROR_INT("newfile not defined", procName, 1);
1510 
1511     if ((data = l_binaryRead(srcfile, &nbytes)) == NULL)
1512         return ERROR_INT("data not returned", procName, 1);
1513     ret = l_binaryWrite(newfile, "w", data, nbytes);
1514     LEPT_FREE(data);
1515     return ret;
1516 }
1517 
1518 
1519 /*!
1520  * \brief   fileConcatenate()
1521  *
1522  * \param[in]    srcfile file to append
1523  * \param[in]    destfile file to add to
1524  * \return  0 if OK, 1 on error
1525  */
1526 l_int32
fileConcatenate(const char * srcfile,const char * destfile)1527 fileConcatenate(const char  *srcfile,
1528                 const char  *destfile)
1529 {
1530 size_t    nbytes;
1531 l_uint8  *data;
1532 
1533     PROCNAME("fileConcatenate");
1534 
1535     if (!srcfile)
1536         return ERROR_INT("srcfile not defined", procName, 1);
1537     if (!destfile)
1538         return ERROR_INT("destfile not defined", procName, 1);
1539 
1540     data = l_binaryRead(srcfile, &nbytes);
1541     l_binaryWrite(destfile, "a", data, nbytes);
1542     LEPT_FREE(data);
1543     return 0;
1544 }
1545 
1546 
1547 /*!
1548  * \brief   fileAppendString()
1549  *
1550  * \param[in]    filename
1551  * \param[in]    str string to append to file
1552  * \return  0 if OK, 1 on error
1553  */
1554 l_int32
fileAppendString(const char * filename,const char * str)1555 fileAppendString(const char  *filename,
1556                  const char  *str)
1557 {
1558 FILE  *fp;
1559 
1560     PROCNAME("fileAppendString");
1561 
1562     if (!filename)
1563         return ERROR_INT("filename not defined", procName, 1);
1564     if (!str)
1565         return ERROR_INT("str not defined", procName, 1);
1566 
1567     if ((fp = fopenWriteStream(filename, "a")) == NULL)
1568         return ERROR_INT("stream not opened", procName, 1);
1569     fprintf(fp, "%s", str);
1570     fclose(fp);
1571     return 0;
1572 }
1573 
1574 
1575 /*--------------------------------------------------------------------*
1576  *          Multi-platform functions for opening file streams         *
1577  *--------------------------------------------------------------------*/
1578 /*!
1579  * \brief   fopenReadStream()
1580  *
1581  * \param[in]    filename
1582  * \return  stream, or NULL on error
1583  *
1584  * <pre>
1585  * Notes:
1586  *      (1) This should be used whenever you want to run fopen() to
1587  *          read from a stream.  Never call fopen() directory.
1588  *      (2) This handles the temp directory pathname conversion on windows:
1589  *              /tmp  ==>  <Windows Temp directory>
1590  * </pre>
1591  */
1592 FILE *
fopenReadStream(const char * filename)1593 fopenReadStream(const char  *filename)
1594 {
1595 char  *fname, *tail;
1596 FILE  *fp;
1597 
1598     PROCNAME("fopenReadStream");
1599 
1600     if (!filename)
1601         return (FILE *)ERROR_PTR("filename not defined", procName, NULL);
1602 
1603         /* Try input filename */
1604     fname = genPathname(filename, NULL);
1605     fp = fopen(fname, "rb");
1606     LEPT_FREE(fname);
1607     if (fp) return fp;
1608 
1609         /* Else, strip directory and try locally */
1610     splitPathAtDirectory(filename, NULL, &tail);
1611     fp = fopen(tail, "rb");
1612     LEPT_FREE(tail);
1613 
1614     if (!fp)
1615         return (FILE *)ERROR_PTR("file not found", procName, NULL);
1616     return fp;
1617 }
1618 
1619 
1620 /*!
1621  * \brief   fopenWriteStream()
1622  *
1623  * \param[in]    filename
1624  * \param[in]    modestring
1625  * \return  stream, or NULL on error
1626  *
1627  * <pre>
1628  * Notes:
1629  *      (1) This should be used whenever you want to run fopen() to
1630  *          write or append to a stream.  Never call fopen() directory.
1631  *      (2) This handles the temp directory pathname conversion on windows:
1632  *              /tmp  ==>  <Windows Temp directory>
1633  * </pre>
1634  */
1635 FILE *
fopenWriteStream(const char * filename,const char * modestring)1636 fopenWriteStream(const char  *filename,
1637                  const char  *modestring)
1638 {
1639 char  *fname;
1640 FILE  *fp;
1641 
1642     PROCNAME("fopenWriteStream");
1643 
1644     if (!filename)
1645         return (FILE *)ERROR_PTR("filename not defined", procName, NULL);
1646 
1647     fname = genPathname(filename, NULL);
1648     fp = fopen(fname, modestring);
1649     LEPT_FREE(fname);
1650     if (!fp)
1651         return (FILE *)ERROR_PTR("stream not opened", procName, NULL);
1652     return fp;
1653 }
1654 
1655 
1656 /*!
1657  * \brief   fopenReadFromMemory()
1658  *
1659  * \param[in]    data, size
1660  * \return  file stream, or NULL on error
1661  *
1662  * <pre>
1663  * Notes:
1664  *      (1) Work-around if fmemopen() not available.
1665  *      (2) Windows tmpfile() writes into the root C:\ directory, which
1666  *          requires admin privileges.  This also works around that.
1667  * </pre>
1668  */
1669 FILE *
fopenReadFromMemory(const l_uint8 * data,size_t size)1670 fopenReadFromMemory(const l_uint8  *data,
1671                     size_t          size)
1672 {
1673 FILE  *fp;
1674 
1675     PROCNAME("fopenReadFromMemory");
1676 
1677     if (!data)
1678         return (FILE *)ERROR_PTR("data not defined", procName, NULL);
1679 
1680 #if HAVE_FMEMOPEN
1681     if ((fp = fmemopen((void *)data, size, "rb")) == NULL)
1682         return (FILE *)ERROR_PTR("stream not opened", procName, NULL);
1683 #else  /* write to tmp file */
1684     L_INFO("work-around: writing to a temp file\n", procName);
1685   #ifdef _WIN32
1686     if ((fp = fopenWriteWinTempfile()) == NULL)
1687         return (FILE *)ERROR_PTR("tmpfile stream not opened", procName, NULL);
1688   #else
1689     if ((fp = tmpfile()) == NULL)
1690         return (FILE *)ERROR_PTR("tmpfile stream not opened", procName, NULL);
1691   #endif  /*  _WIN32 */
1692     fwrite(data, 1, size, fp);
1693     rewind(fp);
1694 #endif  /* HAVE_FMEMOPEN */
1695 
1696     return fp;
1697 }
1698 
1699 
1700 /*--------------------------------------------------------------------*
1701  *                Opening a windows tmpfile for writing               *
1702  *--------------------------------------------------------------------*/
/*!
 * \brief   fopenWriteWinTempfile()
 *
 * \return  file stream, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) The Windows version of tmpfile() writes into the root
 *          C:\ directory, which requires admin privileges.  This
 *          function provides an alternative implementation.
 *      (2) The stream is opened for both reading and writing, in
 *          binary mode.
 *      (3) On platforms other than Windows, this always returns NULL.
 * </pre>
 */
FILE *
fopenWriteWinTempfile()
{
#ifdef _WIN32
l_int32  handle;
FILE    *fp;
char    *filename;

    PROCNAME("fopenWriteWinTempfile");

    if ((filename = l_makeTempFilename()) == NULL) {
        L_ERROR("l_makeTempFilename failed, %s\n", procName, strerror(errno));
        return NULL;
    }

    handle = _open(filename, _O_CREAT | _O_RDWR | _O_SHORT_LIVED |
                   _O_TEMPORARY | _O_BINARY, _S_IREAD | _S_IWRITE);
    lept_free(filename);
    if (handle == -1) {
        L_ERROR("_open failed, %s\n", procName, strerror(errno));
        return NULL;
    }

        /* If no stream can be attached to the descriptor, we still
         * own the descriptor and must close it.  The error is
         * reported first, because closing may change errno. */
    if ((fp = _fdopen(handle, "r+b")) == NULL) {
        L_ERROR("_fdopen failed, %s\n", procName, strerror(errno));
        _close(handle);
        return NULL;
    }

    return fp;
#else
    return NULL;
#endif  /*  _WIN32 */
}
1748 
1749 
1750 /*--------------------------------------------------------------------*
1751  *       Multi-platform functions that avoid C-runtime boundary       *
1752  *             crossing for applications with Windows DLLs            *
1753  *--------------------------------------------------------------------*/
1754 /*
1755  *  Problems arise when pointers to streams and data are passed
1756  *  between two Windows DLLs that have been generated with different
1757  *  C runtimes.  To avoid this, leptonica provides wrappers for
1758  *  several C library calls.
1759  */
1760 /*!
1761  * \brief   lept_fopen()
1762  *
1763  * \param[in]    filename
1764  * \param[in]    mode same as for fopen(); e.g., "rb"
1765  * \return  stream or NULL on error
1766  *
1767  * <pre>
1768  * Notes:
1769  *      (1) This must be used by any application that passes
1770  *          a file handle to a leptonica Windows DLL.
1771  * </pre>
1772  */
1773 FILE *
lept_fopen(const char * filename,const char * mode)1774 lept_fopen(const char  *filename,
1775            const char  *mode)
1776 {
1777     PROCNAME("lept_fopen");
1778 
1779     if (!filename)
1780         return (FILE *)ERROR_PTR("filename not defined", procName, NULL);
1781     if (!mode)
1782         return (FILE *)ERROR_PTR("mode not defined", procName, NULL);
1783 
1784     if (stringFindSubstr(mode, "r", NULL))
1785         return fopenReadStream(filename);
1786     else
1787         return fopenWriteStream(filename, mode);
1788 }
1789 
1790 
1791 /*!
1792  * \brief   lept_fclose()
1793  *
1794  * \param[in]    fp file stream
1795  * \return  0 if OK, 1 on error
1796  *
1797  * <pre>
1798  * Notes:
1799  *      (1) This should be used by any application that accepts
1800  *          a file handle generated by a leptonica Windows DLL.
1801  * </pre>
1802  */
1803 l_int32
lept_fclose(FILE * fp)1804 lept_fclose(FILE *fp)
1805 {
1806     PROCNAME("lept_fclose");
1807 
1808     if (!fp)
1809         return ERROR_INT("stream not defined", procName, 1);
1810 
1811     return fclose(fp);
1812 }
1813 
1814 
/*!
 * \brief   lept_calloc()
 *
 * \param[in]    nmemb number of members
 * \param[in]    size of each member
 * \return  void ptr, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) For safety with windows DLLs, this can be used in conjunction
 *          with lept_free() to avoid C-runtime boundary problems.
 *          Just use these two functions throughout your application.
 *      (2) A request for 0 members, or for members of size 0, is
 *          not passed to the allocator; NULL is returned.
 * </pre>
 */
void *
lept_calloc(size_t  nmemb,
            size_t  size)
{
    if (nmemb > 0 && size > 0)
        return LEPT_CALLOC(nmemb, size);
    return NULL;
}
1837 
1838 
/*!
 * \brief   lept_free()
 *
 * \param[in]    ptr   heap data to release; NULL is ignored
 *
 * <pre>
 * Notes:
 *      (1) This should be used by any application that accepts
 *          heap data allocated by a leptonica Windows DLL.
 * </pre>
 */
void
lept_free(void *ptr)
{
    if (ptr)
        LEPT_FREE(ptr);
}
1857 
1858 
1859 /*--------------------------------------------------------------------*
1860  *                Multi-platform file system operations               *
1861  *         [ These only write to /tmp or its subdirectories ]         *
1862  *--------------------------------------------------------------------*/
1863 /*!
1864  * \brief   lept_mkdir()
1865  *
1866  * \param[in]    subdir of /tmp or its equivalent on Windows
1867  * \return  0 on success, non-zero on failure
1868  *
1869  * <pre>
1870  * Notes:
1871  *      (1) %subdir is a partial path that can consist of one or more
1872  *          directories.
1873  *      (2) This makes any subdirectories of /tmp that are required.
1874  *      (3) The root temp directory is:
1875  *            /tmp    (unix)  [default]
1876  *            <Temp>  (windows)
1877  * </pre>
1878  */
1879 l_int32
lept_mkdir(const char * subdir)1880 lept_mkdir(const char  *subdir)
1881 {
1882 char     *dir, *tmpdir;
1883 l_int32   i, n;
1884 l_int32   ret = 0;
1885 SARRAY   *sa;
1886 #ifdef  _WIN32
1887 l_uint32  attributes;
1888 #endif  /* _WIN32 */
1889 
1890     PROCNAME("lept_mkdir");
1891 
1892     if (!LeptDebugOK) {
1893         L_INFO("making named temp subdirectory %s is disabled\n",
1894                procName, subdir);
1895         return 0;
1896     }
1897 
1898     if (!subdir)
1899         return ERROR_INT("subdir not defined", procName, 1);
1900     if ((strlen(subdir) == 0) || (subdir[0] == '.') || (subdir[0] == '/'))
1901         return ERROR_INT("subdir not an actual subdirectory", procName, 1);
1902 
1903     sa = sarrayCreate(0);
1904     sarraySplitString(sa, subdir, "/");
1905     n = sarrayGetCount(sa);
1906     dir = genPathname("/tmp", NULL);
1907        /* Make sure the tmp directory exists */
1908 #ifndef _WIN32
1909     ret = mkdir(dir, 0777);
1910 #else
1911     attributes = GetFileAttributes(dir);
1912     if (attributes == INVALID_FILE_ATTRIBUTES)
1913         ret = (CreateDirectory(dir, NULL) ? 0 : 1);
1914 #endif
1915         /* Make all the subdirectories */
1916     for (i = 0; i < n; i++) {
1917         tmpdir = pathJoin(dir, sarrayGetString(sa, i, L_NOCOPY));
1918 #ifndef _WIN32
1919         ret += mkdir(tmpdir, 0777);
1920 #else
1921         if (CreateDirectory(tmpdir, NULL) == 0)
1922             ret += (GetLastError () != ERROR_ALREADY_EXISTS);
1923 #endif
1924         LEPT_FREE(dir);
1925         dir = tmpdir;
1926     }
1927     LEPT_FREE(dir);
1928     sarrayDestroy(&sa);
1929     if (ret > 0)
1930         L_ERROR("failure to create %d directories\n", procName, ret);
1931     return ret;
1932 }
1933 
1934 
1935 /*!
1936  * \brief   lept_rmdir()
1937  *
1938  * \param[in]    subdir of /tmp or its equivalent on Windows
1939  * \return  0 on success, non-zero on failure
1940  *
1941  * <pre>
1942  * Notes:
1943  *      (1) %subdir is a partial path that can consist of one or more
1944  *          directories.
1945  *      (2) This removes all files from the specified subdirectory of
1946  *          the root temp directory:
1947  *            /tmp    (unix)
1948  *            <Temp>  (windows)
1949  *          and then removes the subdirectory.
1950  *      (3) The combination
1951  *            lept_rmdir(subdir);
1952  *            lept_mkdir(subdir);
1953  *          is guaranteed to give you an empty subdirectory.
1954  * </pre>
1955  */
1956 l_int32
lept_rmdir(const char * subdir)1957 lept_rmdir(const char  *subdir)
1958 {
1959 char    *dir, *realdir, *fname, *fullname;
1960 l_int32  exists, ret, i, nfiles;
1961 SARRAY  *sa;
1962 #ifdef _WIN32
1963 char    *newpath;
1964 #endif  /* _WIN32 */
1965 
1966     PROCNAME("lept_rmdir");
1967 
1968     if (!subdir)
1969         return ERROR_INT("subdir not defined", procName, 1);
1970     if ((strlen(subdir) == 0) || (subdir[0] == '.') || (subdir[0] == '/'))
1971         return ERROR_INT("subdir not an actual subdirectory", procName, 1);
1972 
1973         /* Find the temp subdirectory */
1974     dir = pathJoin("/tmp", subdir);
1975     if (!dir)
1976         return ERROR_INT("directory name not made", procName, 1);
1977     lept_direxists(dir, &exists);
1978     if (!exists) {  /* fail silently */
1979         LEPT_FREE(dir);
1980         return 0;
1981     }
1982 
1983         /* List all the files in that directory */
1984     if ((sa = getFilenamesInDirectory(dir)) == NULL) {
1985         L_ERROR("directory %s does not exist!\n", procName, dir);
1986         LEPT_FREE(dir);
1987         return 1;
1988     }
1989     nfiles = sarrayGetCount(sa);
1990 
1991     for (i = 0; i < nfiles; i++) {
1992         fname = sarrayGetString(sa, i, L_NOCOPY);
1993         fullname = genPathname(dir, fname);
1994         remove(fullname);
1995         LEPT_FREE(fullname);
1996     }
1997 
1998 #ifndef _WIN32
1999     realdir = genPathname("/tmp", subdir);
2000     ret = rmdir(realdir);
2001     LEPT_FREE(realdir);
2002 #else
2003     newpath = genPathname(dir, NULL);
2004     ret = (RemoveDirectory(newpath) ? 0 : 1);
2005     LEPT_FREE(newpath);
2006 #endif  /* !_WIN32 */
2007 
2008     sarrayDestroy(&sa);
2009     LEPT_FREE(dir);
2010     return ret;
2011 }
2012 
2013 
2014 /*!
2015  * \brief   lept_direxists()
2016  *
2017  * \param[in]    dir
2018  * \param[out]   pexists 1 if it exists; 0 otherwise
2019  * \return  void
2020  *
2021  * <pre>
2022  * Notes:
2023  *      (1) Always use unix pathname separators.
2024  *      (2) By calling genPathname(), if the pathname begins with "/tmp"
2025  *          this does an automatic directory translation on windows
2026  *          to a path in the windows <Temp> directory:
2027  *             "/tmp"  ==>  <Temp> (windows)
2028  * </pre>
2029  */
2030 void
lept_direxists(const char * dir,l_int32 * pexists)2031 lept_direxists(const char  *dir,
2032                l_int32     *pexists)
2033 {
2034 char  *realdir;
2035 
2036     if (!pexists) return;
2037     *pexists = 0;
2038     if (!dir) return;
2039     if ((realdir = genPathname(dir, NULL)) == NULL)
2040         return;
2041 
2042 #ifndef _WIN32
2043     {
2044     struct stat s;
2045     l_int32 err = stat(realdir, &s);
2046     if (err != -1 && S_ISDIR(s.st_mode))
2047         *pexists = 1;
2048     }
2049 #else  /* _WIN32 */
2050     l_uint32  attributes;
2051     attributes = GetFileAttributes(realdir);
2052     if (attributes != INVALID_FILE_ATTRIBUTES &&
2053         (attributes & FILE_ATTRIBUTE_DIRECTORY)) {
2054         *pexists = 1;
2055     }
2056 #endif  /* _WIN32 */
2057 
2058     LEPT_FREE(realdir);
2059     return;
2060 }
2061 
2062 
2063 /*!
2064  * \brief   lept_rm_match()
2065  *
2066  * \param[in]    subdir [optional]  If NULL, the removed files are in /tmp
2067  * \param[in]    substr [optional] pattern to match in filename
2068  * \return  0 on success, non-zero on failure
2069  *
2070  * <pre>
2071  * Notes:
2072  *      (1) This removes the matched files in /tmp or a subdirectory of /tmp.
2073  *          Use NULL for %subdir if the files are in /tmp.
2074  *      (2) If %substr == NULL, this removes all files in the directory.
2075  *          If %substr == "" (empty), this removes no files.
2076  *          If both %subdir == NULL and %substr == NULL, this removes
2077  *          all files in /tmp.
2078  *      (3) Use unix pathname separators.
2079  *      (4) By calling genPathname(), if the pathname begins with "/tmp"
2080  *          this does an automatic directory translation on windows
2081  *          to a path in the windows <Temp> directory:
2082  *             "/tmp"  ==>  <Temp> (windows)
2083  *      (5) Error conditions:
2084  *            * returns -1 if the directory is not found
2085  *            * returns the number of files (> 0) that it was unable to remove.
2086  * </pre>
2087  */
2088 l_int32
lept_rm_match(const char * subdir,const char * substr)2089 lept_rm_match(const char  *subdir,
2090               const char  *substr)
2091 {
2092 char    *path, *fname;
2093 char     tempdir[256];
2094 l_int32  i, n, ret;
2095 SARRAY  *sa;
2096 
2097     PROCNAME("lept_rm_match");
2098 
2099     makeTempDirname(tempdir, 256, subdir);
2100     if ((sa = getSortedPathnamesInDirectory(tempdir, substr, 0, 0)) == NULL)
2101         return ERROR_INT("sa not made", procName, -1);
2102     n = sarrayGetCount(sa);
2103     if (n == 0) {
2104         L_WARNING("no matching files found\n", procName);
2105         sarrayDestroy(&sa);
2106         return 0;
2107     }
2108 
2109     ret = 0;
2110     for (i = 0; i < n; i++) {
2111         fname = sarrayGetString(sa, i, L_NOCOPY);
2112         path = genPathname(fname, NULL);
2113         if (lept_rmfile(path) != 0) {
2114             L_ERROR("failed to remove %s\n", procName, path);
2115             ret++;
2116         }
2117         LEPT_FREE(path);
2118     }
2119     sarrayDestroy(&sa);
2120     return ret;
2121 }
2122 
2123 
2124 /*!
2125  * \brief   lept_rm()
2126  *
2127  * \param[in]    subdir [optional] of '/tmp'; can be NULL
2128  * \param[in]    tail filename without the directory
2129  * \return  0 on success, non-zero on failure
2130  *
2131  * <pre>
2132  * Notes:
2133  *      (1) By calling genPathname(), this does an automatic directory
2134  *          translation on windows to a path in the windows <Temp> directory:
2135  *             "/tmp/..."  ==>  <Temp>/... (windows)
2136  * </pre>
2137  */
2138 l_int32
lept_rm(const char * subdir,const char * tail)2139 lept_rm(const char  *subdir,
2140         const char  *tail)
2141 {
2142 char    *path;
2143 char     newtemp[256];
2144 l_int32  ret;
2145 
2146     PROCNAME("lept_rm");
2147 
2148     if (!tail || strlen(tail) == 0)
2149         return ERROR_INT("tail undefined or empty", procName, 1);
2150 
2151     if (makeTempDirname(newtemp, 256, subdir))
2152         return ERROR_INT("temp dirname not made", procName, 1);
2153     path = genPathname(newtemp, tail);
2154     ret = lept_rmfile(path);
2155     LEPT_FREE(path);
2156     return ret;
2157 }
2158 
2159 
2160 /*!
2161  * \brief
2162  *
2163  *  lept_rmfile()
2164  *
2165  * \param[in]    filepath full path to file including the directory
2166  * \return  0 on success, non-zero on failure
2167  *
2168  * <pre>
2169  * Notes:
2170  *      (1) This removes the named file.
2171  *      (2) Use unix pathname separators.
2172  *      (3) There is no name translation.
2173  *      (4) Unlike the other lept_* functions in this section, this can remove
2174  *          any file -- it is not restricted to files that are in /tmp or a
2175  *          subdirectory of it.
2176  * </pre>
2177  */
2178 l_int32
lept_rmfile(const char * filepath)2179 lept_rmfile(const char  *filepath)
2180 {
2181 l_int32  ret;
2182 
2183     PROCNAME("lept_rmfile");
2184 
2185     if (!filepath || strlen(filepath) == 0)
2186         return ERROR_INT("filepath undefined or empty", procName, 1);
2187 
2188 #ifndef _WIN32
2189     ret = remove(filepath);
2190 #else
2191         /* Set attributes to allow deletion of read-only files */
2192     SetFileAttributes(filepath, FILE_ATTRIBUTE_NORMAL);
2193     ret = DeleteFile(filepath) ? 0 : 1;
2194 #endif  /* !_WIN32 */
2195 
2196     return ret;
2197 }
2198 
2199 
2200 /*!
2201  * \brief   lept_mv()
2202  *
2203  * \param[in]    srcfile
2204  * \param[in]    newdir [optional]; can be NULL
2205  * \param[in]    newtail [optional]; can be NULL
2206  * \param[out]   pnewpath [optional] of actual path; can be NULL
2207  * \return  0 on success, non-zero on failure
2208  *
2209  * <pre>
2210  * Notes:
2211  *      (1) This moves %srcfile to /tmp or to a subdirectory of /tmp.
2212  *      (2) %srcfile can either be a full path or relative to the
2213  *          current directory.
2214  *      (3) %newdir can either specify an existing subdirectory of /tmp
2215  *          or can be NULL.  In the latter case, the file will be written
2216  *          into /tmp.
2217  *      (4) %newtail can either specify a filename tail or, if NULL,
2218  *          the filename is taken from src-tail, the tail of %srcfile.
2219  *      (5) For debugging, the computed newpath can be returned.  It must
2220  *          be freed by the caller.
2221  *      (6) Reminders:
2222  *          (a) specify files using unix pathnames
2223  *          (b) for windows, translates
2224  *                 /tmp  ==>  <Temp>
2225  *              where <Temp> is the windows temp directory
2226  *      (7) Examples:
2227  *          * newdir = NULL,    newtail = NULL    ==> /tmp/src-tail
2228  *          * newdir = NULL,    newtail = abc     ==> /tmp/abc
2229  *          * newdir = def/ghi, newtail = NULL    ==> /tmp/def/ghi/src-tail
2230  *          * newdir = def/ghi, newtail = abc     ==> /tmp/def/ghi/abc
2231  * </pre>
2232  */
2233 l_int32
lept_mv(const char * srcfile,const char * newdir,const char * newtail,char ** pnewpath)2234 lept_mv(const char  *srcfile,
2235         const char  *newdir,
2236         const char  *newtail,
2237         char       **pnewpath)
2238 {
2239 char    *srcpath, *newpath, *realpath, *dir, *srctail;
2240 char     newtemp[256];
2241 l_int32  ret;
2242 
2243     PROCNAME("lept_mv");
2244 
2245     if (!srcfile)
2246         return ERROR_INT("srcfile not defined", procName, 1);
2247 
2248         /* Require output pathname to be in /tmp/ or a subdirectory */
2249     if (makeTempDirname(newtemp, 256, newdir) == 1)
2250         return ERROR_INT("newdir not NULL or a subdir of /tmp", procName, 1);
2251 
2252         /* Get canonical src pathname */
2253     splitPathAtDirectory(srcfile, &dir, &srctail);
2254 
2255 #ifndef _WIN32
2256     srcpath = pathJoin(dir, srctail);
2257     LEPT_FREE(dir);
2258 
2259         /* Generate output pathname */
2260     if (!newtail || newtail[0] == '\0')
2261         newpath = pathJoin(newtemp, srctail);
2262     else
2263         newpath = pathJoin(newtemp, newtail);
2264     LEPT_FREE(srctail);
2265 
2266         /* Overwrite any existing file at 'newpath' */
2267     ret = fileCopy(srcpath, newpath);
2268     if (!ret) {
2269         realpath = genPathname(srcpath, NULL);
2270         remove(realpath);
2271         LEPT_FREE(realpath);
2272     }
2273 #else
2274     srcpath = genPathname(dir, srctail);
2275     LEPT_FREE(dir);
2276 
2277         /* Generate output pathname */
2278     if (!newtail || newtail[0] == '\0')
2279         newpath = genPathname(newtemp, srctail);
2280     else
2281         newpath = genPathname(newtemp, newtail);
2282     LEPT_FREE(srctail);
2283 
2284         /* Overwrite any existing file at 'newpath' */
2285     ret = MoveFileEx(srcpath, newpath,
2286                      MOVEFILE_COPY_ALLOWED | MOVEFILE_REPLACE_EXISTING) ? 0 : 1;
2287 #endif  /* ! _WIN32 */
2288 
2289     LEPT_FREE(srcpath);
2290     if (pnewpath)
2291         *pnewpath = newpath;
2292     else
2293         LEPT_FREE(newpath);
2294     return ret;
2295 }
2296 
2297 
2298 /*!
2299  * \brief   lept_cp()
2300  *
2301  * \param[in]    srcfile
2302  * \param[in]    newdir [optional]; can be NULL
2303  * \param[in]    newtail [optional]; can be NULL
2304  * \param[out]   pnewpath [optional] of actual path; can be NULL
2305  * \return  0 on success, non-zero on failure
2306  *
2307  * <pre>
2308  * Notes:
2309  *      (1) This copies %srcfile to /tmp or to a subdirectory of /tmp.
2310  *      (2) %srcfile can either be a full path or relative to the
2311  *          current directory.
2312  *      (3) %newdir can either specify an existing subdirectory of /tmp,
2313  *          or can be NULL.  In the latter case, the file will be written
2314  *          into /tmp.
2315  *      (4) %newtail can either specify a filename tail or, if NULL,
2316  *          the filename is taken from src-tail, the tail of %srcfile.
2317  *      (5) For debugging, the computed newpath can be returned.  It must
2318  *          be freed by the caller.
2319  *      (6) Reminders:
2320  *          (a) specify files using unix pathnames
2321  *          (b) for windows, translates
2322  *                 /tmp  ==>  <Temp>
2323  *              where <Temp> is the windows temp directory
2324  *      (7) Examples:
2325  *          * newdir = NULL,    newtail = NULL    ==> /tmp/src-tail
2326  *          * newdir = NULL,    newtail = abc     ==> /tmp/abc
2327  *          * newdir = def/ghi, newtail = NULL    ==> /tmp/def/ghi/src-tail
2328  *          * newdir = def/ghi, newtail = abc     ==> /tmp/def/ghi/abc
2329  *
2330  * </pre>
2331  */
2332 l_int32
lept_cp(const char * srcfile,const char * newdir,const char * newtail,char ** pnewpath)2333 lept_cp(const char  *srcfile,
2334         const char  *newdir,
2335         const char  *newtail,
2336         char       **pnewpath)
2337 {
2338 char    *srcpath, *newpath, *dir, *srctail;
2339 char     newtemp[256];
2340 l_int32  ret;
2341 
2342     PROCNAME("lept_cp");
2343 
2344     if (!srcfile)
2345         return ERROR_INT("srcfile not defined", procName, 1);
2346 
2347         /* Require output pathname to be in /tmp or a subdirectory */
2348     if (makeTempDirname(newtemp, 256, newdir) == 1)
2349         return ERROR_INT("newdir not NULL or a subdir of /tmp", procName, 1);
2350 
2351        /* Get canonical src pathname */
2352     splitPathAtDirectory(srcfile, &dir, &srctail);
2353 
2354 #ifndef _WIN32
2355     srcpath = pathJoin(dir, srctail);
2356     LEPT_FREE(dir);
2357 
2358         /* Generate output pathname */
2359     if (!newtail || newtail[0] == '\0')
2360         newpath = pathJoin(newtemp, srctail);
2361     else
2362         newpath = pathJoin(newtemp, newtail);
2363     LEPT_FREE(srctail);
2364 
2365         /* Overwrite any existing file at 'newpath' */
2366     ret = fileCopy(srcpath, newpath);
2367 #else
2368     srcpath = genPathname(dir, srctail);
2369     LEPT_FREE(dir);
2370 
2371         /* Generate output pathname */
2372     if (!newtail || newtail[0] == '\0')
2373         newpath = genPathname(newtemp, srctail);
2374     else
2375         newpath = genPathname(newtemp, newtail);
2376     LEPT_FREE(srctail);
2377 
2378         /* Overwrite any existing file at 'newpath' */
2379     ret = CopyFile(srcpath, newpath, FALSE) ? 0 : 1;
2380 #endif   /* !_WIN32 */
2381 
2382     LEPT_FREE(srcpath);
2383     if (pnewpath)
2384         *pnewpath = newpath;
2385     else
2386         LEPT_FREE(newpath);
2387     return ret;
2388 }
2389 
2390 
2391 /*--------------------------------------------------------------------*
2392  *                     General file name operations                   *
2393  *--------------------------------------------------------------------*/
2394 /*!
2395  * \brief   splitPathAtDirectory()
2396  *
2397  * \param[in]    pathname  full path; can be a directory
2398  * \param[out]   pdir  [optional] root directory name of
2399  *                     input path, including trailing '/'
2400  * \param[out]   ptail [optional] path tail, which is either
2401  *                     the file name within the root directory or
2402  *                     the last sub-directory in the path
2403  * \return  0 if OK, 1 on error
2404  *
2405  * <pre>
2406  * Notes:
2407  *      (1) If you only want the tail, input null for the root directory ptr.
2408  *      (2) If you only want the root directory name, input null for the
2409  *          tail ptr.
2410  *      (3) This function makes decisions based only on the lexical
2411  *          structure of the input.  Examples:
2412  *            /usr/tmp/abc  -->  dir: /usr/tmp/       tail: abc
2413  *            /usr/tmp/     -->  dir: /usr/tmp/       tail: [empty string]
2414  *            /usr/tmp      -->  dir: /usr/           tail: tmp
2415  *            abc           -->  dir: [empty string]  tail: abc
2416  *      (4) The input can have either forward (unix) or backward (win)
2417  *          slash separators.  The output has unix separators.
2418  *          Note that Win32 pathname functions generally accept both
2419  *          slash forms, but the windows command line interpreter
2420  *          only accepts backward slashes, because forward slashes are
2421  *          used to demarcate switches (vs. dashes in unix).
2422  * </pre>
2423  */
2424 l_int32
splitPathAtDirectory(const char * pathname,char ** pdir,char ** ptail)2425 splitPathAtDirectory(const char  *pathname,
2426                      char       **pdir,
2427                      char       **ptail)
2428 {
2429 char  *cpathname, *lastslash;
2430 
2431     PROCNAME("splitPathAtDirectory");
2432 
2433     if (!pdir && !ptail)
2434         return ERROR_INT("null input for both strings", procName, 1);
2435     if (pdir) *pdir = NULL;
2436     if (ptail) *ptail = NULL;
2437     if (!pathname)
2438         return ERROR_INT("pathname not defined", procName, 1);
2439 
2440     cpathname = stringNew(pathname);
2441     convertSepCharsInPath(cpathname, UNIX_PATH_SEPCHAR);
2442     lastslash = strrchr(cpathname, '/');
2443     if (lastslash) {
2444         if (ptail)
2445             *ptail = stringNew(lastslash + 1);
2446         if (pdir) {
2447             *(lastslash + 1) = '\0';
2448             *pdir = cpathname;
2449         } else {
2450             LEPT_FREE(cpathname);
2451         }
2452     } else {  /* no directory */
2453         if (pdir)
2454             *pdir = stringNew("");
2455         if (ptail)
2456             *ptail = cpathname;
2457         else
2458             LEPT_FREE(cpathname);
2459     }
2460 
2461     return 0;
2462 }
2463 
2464 
2465 /*!
2466  * \brief   splitPathAtExtension()
2467  *
2468  * \param[in]    pathname full path; can be a directory
2469  * \param[out]   pbasename [optional] pathname not including the
2470  *                        last dot and characters after that
2471  * \param[out]   pextension [optional] path extension, which is
2472  *                        the last dot and the characters after it.  If
2473  *                        there is no extension, it returns the empty string
2474  * \return  0 if OK, 1 on error
2475  *
2476  * <pre>
2477  * Notes:
2478  *      (1) If you only want the extension, input null for the basename ptr.
2479  *      (2) If you only want the basename without extension, input null
2480  *          for the extension ptr.
2481  *      (3) This function makes decisions based only on the lexical
2482  *          structure of the input.  Examples:
2483  *            /usr/tmp/abc.jpg  -->  basename: /usr/tmp/abc    ext: .jpg
2484  *            /usr/tmp/.jpg     -->  basename: /usr/tmp/       ext: .jpg
2485  *            /usr/tmp.jpg/     -->  basename: /usr/tmp.jpg/   ext: [empty str]
2486  *            ./.jpg            -->  basename: ./              ext: .jpg
2487  *      (4) The input can have either forward (unix) or backward (win)
2488  *          slash separators.  The output has unix separators.
2489  * </pre>
2490  */
2491 l_int32
splitPathAtExtension(const char * pathname,char ** pbasename,char ** pextension)2492 splitPathAtExtension(const char  *pathname,
2493                      char       **pbasename,
2494                      char       **pextension)
2495 {
2496 char  *tail, *dir, *lastdot;
2497 char   empty[4] = "";
2498 
2499     PROCNAME("splitPathExtension");
2500 
2501     if (!pbasename && !pextension)
2502         return ERROR_INT("null input for both strings", procName, 1);
2503     if (pbasename) *pbasename = NULL;
2504     if (pextension) *pextension = NULL;
2505     if (!pathname)
2506         return ERROR_INT("pathname not defined", procName, 1);
2507 
2508         /* Split out the directory first */
2509     splitPathAtDirectory(pathname, &dir, &tail);
2510 
2511         /* Then look for a "." in the tail part.
2512          * This way we ignore all "." in the directory. */
2513     if ((lastdot = strrchr(tail, '.'))) {
2514         if (pextension)
2515             *pextension = stringNew(lastdot);
2516         if (pbasename) {
2517             *lastdot = '\0';
2518             *pbasename = stringJoin(dir, tail);
2519         }
2520     } else {
2521         if (pextension)
2522             *pextension = stringNew(empty);
2523         if (pbasename)
2524             *pbasename = stringNew(pathname);
2525     }
2526     LEPT_FREE(dir);
2527     LEPT_FREE(tail);
2528     return 0;
2529 }
2530 
2531 
2532 /*!
2533  * \brief   pathJoin()
2534  *
2535  * \param[in]    dir [optional] can be null
2536  * \param[in]    fname [optional] can be null
2537  * \return  specially concatenated path, or NULL on error
2538  *
2539  * <pre>
2540  * Notes:
2541  *      (1) Use unix-style pathname separators ('/').
2542  *      (2) %fname can be the entire path, or part of the path containing
2543  *          at least one directory, or a tail without a directory, or NULL.
2544  *      (3) It produces a path that strips multiple slashes to a single
2545  *          slash, joins %dir and %fname by a slash, and has no trailing
2546  *          slashes (except in the cases where %dir == "/" and
2547  *          %fname == NULL, or v.v.).
2548  *      (4) If both %dir and %fname are null, produces an empty string.
2549  *      (5) Neither %dir nor %fname can begin with '..'.
2550  *      (6) The result is not canonicalized or tested for correctness:
2551  *          garbage in (e.g., /&%), garbage out.
2552  *      (7) Examples:
2553  *             //tmp// + //abc/  -->  /tmp/abc
2554  *             tmp/ + /abc/      -->  tmp/abc
2555  *             tmp/ + abc/       -->  tmp/abc
2556  *             /tmp/ + ///       -->  /tmp
2557  *             /tmp/ + NULL      -->  /tmp
2558  *             // + /abc//       -->  /abc
2559  *             // + NULL         -->  /
2560  *             NULL + /abc/def/  -->  /abc/def
2561  *             NULL + abc//      -->  abc
2562  *             NULL + //         -->  /
2563  *             NULL + NULL       -->  (empty string)
2564  *             "" + ""           -->  (empty string)
2565  *             "" + /            -->  /
2566  *             ".." + /etc/foo   -->  NULL
2567  *             /tmp + ".."       -->  NULL
2568  * </pre>
2569  */
2570 char *
pathJoin(const char * dir,const char * fname)2571 pathJoin(const char  *dir,
2572          const char  *fname)
2573 {
2574 char     *slash = (char *)"/";
2575 char     *str, *dest;
2576 l_int32   i, n1, n2, emptydir;
2577 size_t    size;
2578 SARRAY   *sa1, *sa2;
2579 L_BYTEA  *ba;
2580 
2581     PROCNAME("pathJoin");
2582 
2583     if (!dir && !fname)
2584         return stringNew("");
2585     if (dir && strlen(dir) >= 2 && dir[0] == '.' && dir[1] == '.')
2586         return (char *)ERROR_PTR("dir starts with '..'", procName, NULL);
2587     if (fname && strlen(fname) >= 2 && fname[0] == '.' && fname[1] == '.')
2588         return (char *)ERROR_PTR("fname starts with '..'", procName, NULL);
2589 
2590     sa1 = sarrayCreate(0);
2591     sa2 = sarrayCreate(0);
2592     ba = l_byteaCreate(4);
2593 
2594         /* Process %dir */
2595     if (dir && strlen(dir) > 0) {
2596         if (dir[0] == '/')
2597             l_byteaAppendString(ba, slash);
2598         sarraySplitString(sa1, dir, "/");  /* removes all slashes */
2599         n1 = sarrayGetCount(sa1);
2600         for (i = 0; i < n1; i++) {
2601             str = sarrayGetString(sa1, i, L_NOCOPY);
2602             l_byteaAppendString(ba, str);
2603             l_byteaAppendString(ba, slash);
2604         }
2605     }
2606 
2607         /* Special case to add leading slash: dir NULL or empty string  */
2608     emptydir = dir && strlen(dir) == 0;
2609     if ((!dir || emptydir) && fname && strlen(fname) > 0 && fname[0] == '/')
2610         l_byteaAppendString(ba, slash);
2611 
2612         /* Process %fname */
2613     if (fname && strlen(fname) > 0) {
2614         sarraySplitString(sa2, fname, "/");
2615         n2 = sarrayGetCount(sa2);
2616         for (i = 0; i < n2; i++) {
2617             str = sarrayGetString(sa2, i, L_NOCOPY);
2618             l_byteaAppendString(ba, str);
2619             l_byteaAppendString(ba, slash);
2620         }
2621     }
2622 
2623         /* Remove trailing slash */
2624     dest = (char *)l_byteaCopyData(ba, &size);
2625     if (size > 1 && dest[size - 1] == '/')
2626         dest[size - 1] = '\0';
2627 
2628     sarrayDestroy(&sa1);
2629     sarrayDestroy(&sa2);
2630     l_byteaDestroy(&ba);
2631     return dest;
2632 }
2633 
2634 
2635 /*!
2636  * \brief   appendSubdirs()
2637  *
2638  * \param[in]    basedir
2639  * \param[in]    subdirs
2640  * \return  concatenated full directory path without trailing slash,
2641  *              or NULL on error
2642  *
2643  * <pre>
2644  * Notes:
2645  *      (1) Use unix pathname separators
2646  *      (2) Allocates a new string:  <basedir>/<subdirs>
2647  * </pre>
2648  */
2649 char *
appendSubdirs(const char * basedir,const char * subdirs)2650 appendSubdirs(const char  *basedir,
2651               const char  *subdirs)
2652 {
2653 char   *newdir;
2654 size_t  len1, len2, len3, len4;
2655 
2656     PROCNAME("appendSubdirs");
2657 
2658     if (!basedir || !subdirs)
2659         return (char *)ERROR_PTR("basedir and subdirs not both defined",
2660                                  procName, NULL);
2661 
2662     len1 = strlen(basedir);
2663     len2 = strlen(subdirs);
2664     len3 = len1 + len2 + 6;
2665     if ((newdir = (char *)LEPT_CALLOC(len3 + 1, 1)) == NULL)
2666         return (char *)ERROR_PTR("newdir not made", procName, NULL);
2667     strncat(newdir, basedir, len3);  /* add basedir */
2668     if (newdir[len1 - 1] != '/')  /* add '/' if necessary */
2669         newdir[len1] = '/';
2670     if (subdirs[0] == '/')  /* add subdirs, stripping leading '/' */
2671         strncat(newdir, subdirs + 1, len3);
2672     else
2673         strncat(newdir, subdirs, len3);
2674     len4 = strlen(newdir);
2675     if (newdir[len4 - 1] == '/')  /* strip trailing '/' */
2676         newdir[len4 - 1] = '\0';
2677 
2678     return newdir;
2679 }
2680 
2681 
2682 /*--------------------------------------------------------------------*
2683  *                     Special file name operations                   *
2684  *--------------------------------------------------------------------*/
2685 /*!
2686  * \brief   convertSepCharsInPath()
2687  *
2688  * \param[in]    path
2689  * \param[in]    type UNIX_PATH_SEPCHAR, WIN_PATH_SEPCHAR
2690  * \return  0 if OK, 1 on error
2691  *
2692  * <pre>
2693  * Notes:
2694  *      (1) In-place conversion.
2695  *      (2) Type is the resulting type:
2696  *            * UNIX_PATH_SEPCHAR:  '\\' ==> '/'
2697  *            * WIN_PATH_SEPCHAR:   '/' ==> '\\'
2698  *      (3) Virtually all path operations in leptonica use unix separators.
2699  * </pre>
2700  */
2701 l_int32
convertSepCharsInPath(char * path,l_int32 type)2702 convertSepCharsInPath(char    *path,
2703                       l_int32  type)
2704 {
2705 l_int32  i;
2706 size_t   len;
2707 
2708     PROCNAME("convertSepCharsInPath");
2709     if (!path)
2710         return ERROR_INT("path not defined", procName, 1);
2711     if (type != UNIX_PATH_SEPCHAR && type != WIN_PATH_SEPCHAR)
2712         return ERROR_INT("invalid type", procName, 1);
2713 
2714     len = strlen(path);
2715     if (type == UNIX_PATH_SEPCHAR) {
2716         for (i = 0; i < len; i++) {
2717             if (path[i] == '\\')
2718                 path[i] = '/';
2719         }
2720     } else {  /* WIN_PATH_SEPCHAR */
2721         for (i = 0; i < len; i++) {
2722             if (path[i] == '/')
2723                 path[i] = '\\';
2724         }
2725     }
2726     return 0;
2727 }
2728 
2729 
2730 /*!
2731  * \brief   genPathname()
2732  *
2733  * \param[in]    dir [optional] directory or full path name, with or without
2734  *                   trailing '/'
2735  * \param[in]    fname [optional] file name within a directory
2736  * \return  pathname either a directory or full path, or NULL on error
2737  *
2738  * <pre>
2739  * Notes:
2740  *      (1) This function generates actual paths in the following ways:
2741  *            * from two sub-parts (e.g., a directory and a file name).
2742  *            * from a single path full path, placed in %dir, with
2743  *              %fname == NULL.
2744  *            * from the name of a file in the local directory placed in
2745  *              %fname, with %dir == NULL.
2746  *            * if in a "/tmp" directory and on windows, the windows
2747  *              temp directory is used.
2748  *      (2) On windows, if the root of %dir is '/tmp', this does a name
2749  *          translation:
2750  *             "/tmp"  ==>  <Temp> (windows)
2751  *          where <Temp> is the windows temp directory.
2752  *      (3) On unix, the TMPDIR variable is ignored.  No rewriting
2753  *          of temp directories is permitted.
2754  *      (4) There are four cases for the input:
2755  *          (a) %dir is a directory and %fname is defined: result is a full path
2756  *          (b) %dir is a directory and %fname is null: result is a directory
2757  *          (c) %dir is a full path and %fname is null: result is a full path
2758  *          (d) %dir is null or an empty string: start in the current dir;
2759  *              result is a full path
2760  *      (5) In all cases, the resulting pathname is not terminated with a slash
2761  *      (6) The caller is responsible for freeing the returned pathname.
2762  * </pre>
2763  */
2764 char *
genPathname(const char * dir,const char * fname)2765 genPathname(const char  *dir,
2766             const char  *fname)
2767 {
2768 l_int32  is_win32 = FALSE;
2769 char    *cdir, *pathout;
2770 l_int32  dirlen, namelen, size;
2771 
2772     PROCNAME("genPathname");
2773 
2774     if (!dir && !fname)
2775         return (char *)ERROR_PTR("no input", procName, NULL);
2776 
2777         /* Handle the case where we start from the current directory */
2778     if (!dir || dir[0] == '\0') {
2779         if ((cdir = getcwd(NULL, 0)) == NULL)
2780             return (char *)ERROR_PTR("no current dir found", procName, NULL);
2781     } else {
2782         cdir = stringNew(dir);
2783     }
2784 
2785         /* Convert to unix path separators, and remove the trailing
2786          * slash in the directory, except when dir == "/"  */
2787     convertSepCharsInPath(cdir, UNIX_PATH_SEPCHAR);
2788     dirlen = strlen(cdir);
2789     if (cdir[dirlen - 1] == '/' && dirlen != 1) {
2790         cdir[dirlen - 1] = '\0';
2791         dirlen--;
2792     }
2793 
2794     namelen = (fname) ? strlen(fname) : 0;
2795     size = dirlen + namelen + 256;
2796     if ((pathout = (char *)LEPT_CALLOC(size, sizeof(char))) == NULL) {
2797         LEPT_FREE(cdir);
2798         return (char *)ERROR_PTR("pathout not made", procName, NULL);
2799     }
2800 
2801 #ifdef _WIN32
2802     is_win32 = TRUE;
2803 #endif  /* _WIN32 */
2804 
2805         /* First handle %dir (which may be a full pathname).
2806          * There is no path rewriting on unix, and on win32, we do not
2807          * rewrite unless the specified directory is /tmp or
2808          * a subdirectory of /tmp */
2809     if (!is_win32 || dirlen < 4 ||
2810         (dirlen == 4 && strncmp(cdir, "/tmp", 4) != 0) ||  /* not in "/tmp" */
2811         (dirlen > 4 && strncmp(cdir, "/tmp/", 5) != 0)) {  /* not in "/tmp/" */
2812         stringCopy(pathout, cdir, dirlen);
2813     } else {  /* Rewrite for win32 with "/tmp" specified for the directory. */
2814 #ifdef _WIN32
2815         l_int32 tmpdirlen;
2816         char tmpdir[MAX_PATH];
2817         GetTempPath(sizeof(tmpdir), tmpdir);  /* get the windows temp dir */
2818         tmpdirlen = strlen(tmpdir);
2819         if (tmpdirlen > 0 && tmpdir[tmpdirlen - 1] == '\\') {
2820             tmpdir[tmpdirlen - 1] = '\0';  /* trim the trailing '\' */
2821         }
2822         tmpdirlen = strlen(tmpdir);
2823         stringCopy(pathout, tmpdir, tmpdirlen);
2824 
2825             /* Add the rest of cdir */
2826         if (dirlen > 4)
2827             stringCat(pathout, size, cdir + 4);
2828 #endif  /* _WIN32 */
2829     }
2830 
2831         /* Now handle %fname */
2832     if (fname && strlen(fname) > 0) {
2833         dirlen = strlen(pathout);
2834         pathout[dirlen] = '/';
2835         strncat(pathout, fname, namelen);
2836     }
2837 
2838     LEPT_FREE(cdir);
2839     return pathout;
2840 }
2841 
2842 
2843 /*!
2844  * \brief   makeTempDirname()
2845  *
2846  * \param[in]    result preallocated on stack or heap and passed in
2847  * \param[in]    nbytes size of %result array, in bytes
2848  * \param[in]    subdir [optional]; can be NULL or an empty string
2849  * \return  0 if OK, 1 on error
2850  *
2851  * <pre>
2852  * Notes:
2853  *      (1) This generates the directory path for output temp files,
2854  *          written into %result with unix separators.
2855  *      (2) Caller allocates %result, large enough to hold the path,
2856  *          which is:
2857  *            /tmp/%subdir       (unix)
2858  *            <Temp>/%subdir     (windows)
2859  *          where <Temp> is a path on windows determined by GenTempPath()
2860  *          and %subdir is in general a set of nested subdirectories:
2861  *            dir1/dir2/.../dirN
2862  *          which in use would not typically exceed 2 levels.
2863  *      (3) Usage example:
2864  * \code
2865  *           char  result[256];
2866  *           makeTempDirname(result, 256, "lept/golden");
2867  * \endcode
2868  * </pre>
2869  */
2870 l_int32
makeTempDirname(char * result,size_t nbytes,const char * subdir)2871 makeTempDirname(char        *result,
2872                 size_t       nbytes,
2873                 const char  *subdir)
2874 {
2875 char    *dir, *path;
2876 l_int32  ret = 0;
2877 size_t   pathlen;
2878 
2879     PROCNAME("makeTempDirname");
2880 
2881     if (!result)
2882         return ERROR_INT("result not defined", procName, 1);
2883     if (subdir && ((subdir[0] == '.') || (subdir[0] == '/')))
2884         return ERROR_INT("subdir not an actual subdirectory", procName, 1);
2885 
2886     memset(result, 0, nbytes);
2887     dir = pathJoin("/tmp", subdir);
2888 #ifndef _WIN32
2889     path = stringNew(dir);
2890 #else
2891     path = genPathname(dir, NULL);
2892 #endif  /*  ~ _WIN32 */
2893     pathlen = strlen(path);
2894     if (pathlen < nbytes - 1) {
2895         strncpy(result, path, pathlen);
2896     } else {
2897         L_ERROR("result array too small for path\n", procName);
2898         ret = 1;
2899     }
2900 
2901     LEPT_FREE(dir);
2902     LEPT_FREE(path);
2903     return ret;
2904 }
2905 
2906 
2907 /*!
2908  * \brief   modifyTrailingSlash()
2909  *
2910  * \param[in]    path preallocated on stack or heap and passed in
2911  * \param[in]    nbytes size of %path array, in bytes
2912  * \param[in]    flag L_ADD_TRAIL_SLASH or L_REMOVE_TRAIL_SLASH
2913  * \return  0 if OK, 1 on error
2914  *
2915  * <pre>
2916  * Notes:
2917  *      (1) This carries out the requested action if necessary.
2918  * </pre>
2919  */
2920 l_int32
modifyTrailingSlash(char * path,size_t nbytes,l_int32 flag)2921 modifyTrailingSlash(char    *path,
2922                     size_t   nbytes,
2923                     l_int32  flag)
2924 {
2925 char    lastchar;
2926 size_t  len;
2927 
2928     PROCNAME("modifyTrailingSlash");
2929 
2930     if (!path)
2931         return ERROR_INT("path not defined", procName, 1);
2932     if (flag != L_ADD_TRAIL_SLASH && flag != L_REMOVE_TRAIL_SLASH)
2933         return ERROR_INT("invalid flag", procName, 1);
2934 
2935     len = strlen(path);
2936     lastchar = path[len - 1];
2937     if (flag == L_ADD_TRAIL_SLASH && lastchar != '/' && len < nbytes - 2) {
2938         path[len] = '/';
2939         path[len + 1] = '\0';
2940     } else if (flag == L_REMOVE_TRAIL_SLASH && lastchar == '/') {
2941         path[len - 1] = '\0';
2942     }
2943     return 0;
2944 }
2945 
2946 
2947 /*!
2948  * \brief   l_makeTempFilename()
2949  *
2950  * \return  fname : heap allocated filename; returns NULL on failure.
2951  *
2952  * <pre>
2953  * Notes:
2954  *      (1) On unix, this makes a filename of the form
2955  *               "/tmp/lept.XXXXXX",
2956  *          where each X is a random character.
2957  *      (2) On windows, this makes a filename of the form
2958  *               "/<Temp>/lp.XXXXXX".
2959  *      (3) On all systems, this fails if the file is not writable.
2960  *      (4) Safest usage is to write to a subdirectory in debug code.
2961  *      (5) The returned filename must be freed by the caller, using lept_free.
2962  *      (6) The tail of the filename has a '.', so that cygwin interprets
2963  *          the file as having an extension.  Otherwise, cygwin assumes it
2964  *          is an executable and appends ".exe" to the filename.
2965  *      (7) On unix, whenever possible use tmpfile() instead.  tmpfile()
2966  *          hides the file name, returns a stream opened for write,
2967  *          and deletes the temp file when the stream is closed.
2968  */
2969 char *
l_makeTempFilename()2970 l_makeTempFilename()
2971 {
2972 char  dirname[240];
2973 
2974     PROCNAME("l_makeTempFilename");
2975 
2976     if (makeTempDirname(dirname, sizeof(dirname), NULL) == 1)
2977         return (char *)ERROR_PTR("failed to make dirname", procName, NULL);
2978 
2979 #ifndef _WIN32
2980 {
2981     char    *pattern;
2982     l_int32  fd;
2983     pattern = stringConcatNew(dirname, "/lept.XXXXXX", NULL);
2984     fd = mkstemp(pattern);
2985     if (fd == -1) {
2986         LEPT_FREE(pattern);
2987         return (char *)ERROR_PTR("mkstemp failed", procName, NULL);
2988     }
2989     close(fd);
2990     return pattern;
2991 }
2992 #else
2993 {
2994     char  fname[MAX_PATH];
2995     FILE *fp;
2996     if (GetTempFileName(dirname, "lp.", 0, fname) == 0)
2997         return (char *)ERROR_PTR("GetTempFileName failed", procName, NULL);
2998     if ((fp = fopen(fname, "wb")) == NULL)
2999         return (char *)ERROR_PTR("file cannot be written to", procName, NULL);
3000     fclose(fp);
3001     return stringNew(fname);
3002 }
3003 #endif  /*  ~ _WIN32 */
3004 }
3005 
3006 
3007 /*!
3008  * \brief   extractNumberFromFilename()
3009  *
3010  * \param[in]    fname
3011  * \param[in]    numpre number of characters before the digits to be found
3012  * \param[in]    numpost number of characters after the digits to be found
3013  * \return  num number embedded in the filename; -1 on error or if
3014  *                   not found
3015  *
3016  * <pre>
3017  * Notes:
3018  *      (1) The number is to be found in the basename, which is the
3019  *          filename without either the directory or the last extension.
3020  *      (2) When a number is found, it is non-negative.  If no number
3021  *          is found, this returns -1, without an error message.  The
3022  *          caller needs to check.
3023  * </pre>
3024  */
3025 l_int32
extractNumberFromFilename(const char * fname,l_int32 numpre,l_int32 numpost)3026 extractNumberFromFilename(const char  *fname,
3027                           l_int32      numpre,
3028                           l_int32      numpost)
3029 {
3030 char    *tail, *basename;
3031 l_int32  len, nret, num;
3032 
3033     PROCNAME("extractNumberFromFilename");
3034 
3035     if (!fname)
3036         return ERROR_INT("fname not defined", procName, -1);
3037 
3038     splitPathAtDirectory(fname, NULL, &tail);
3039     splitPathAtExtension(tail, &basename, NULL);
3040     LEPT_FREE(tail);
3041 
3042     len = strlen(basename);
3043     if (numpre + numpost > len - 1) {
3044         LEPT_FREE(basename);
3045         return ERROR_INT("numpre + numpost too big", procName, -1);
3046     }
3047 
3048     basename[len - numpost] = '\0';
3049     nret = sscanf(basename + numpre, "%d", &num);
3050     LEPT_FREE(basename);
3051 
3052     if (nret == 1)
3053         return num;
3054     else
3055         return -1;  /* not found */
3056 }
3057