1 /* readtokens.c  -- Functions for reading tokens from an input stream.
2 
3    Copyright (C) 1990-1991, 1999-2004, 2006, 2009-2018 Free Software
4    Foundation, Inc.
5 
6    This program is free software: you can redistribute it and/or modify
7    it under the terms of the GNU General Public License as published by
8    the Free Software Foundation; either version 3 of the License, or
9    (at your option) any later version.
10 
11    This program is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14    GNU General Public License for more details.
15 
16    You should have received a copy of the GNU General Public License
17    along with this program.  If not, see <https://www.gnu.org/licenses/>.
18 
19    Written by Jim Meyering. */
20 
21 /* This almost supersedes xreadline stuff -- using delim="\n"
22    gives the same functionality, except that these functions
23    would never return empty lines. */
24 
25 #include <config.h>
26 
27 #include "readtokens.h"
28 
29 #include <limits.h>
30 #include <stdio.h>
31 #include <stdlib.h>
32 #include <string.h>
33 #include <stdbool.h>
34 
35 #include "xalloc.h"
36 
37 #if USE_UNLOCKED_IO
38 # include "unlocked-io.h"
39 #endif
40 
41 /* Initialize a tokenbuffer. */
42 
43 void
init_tokenbuffer(token_buffer * tokenbuffer)44 init_tokenbuffer (token_buffer *tokenbuffer)
45 {
46   tokenbuffer->size = 0;
47   tokenbuffer->buffer = NULL;
48 }
49 
/* A bitset is an array of 'word's; each word contributes
   bits_per_word bits, numbered from the least significant end.  */
typedef size_t word;
enum { bits_per_word = sizeof (word) * CHAR_BIT };

/* Return true if bit number N of BITSET is set, false otherwise.
   BITSET must have at least N / bits_per_word + 1 words.  */
static bool
get_nth_bit (size_t n, word const *bitset)
{
  size_t which_word = n / bits_per_word;
  size_t which_bit = n % bits_per_word;

  return (bitset[which_word] >> which_bit) & 1;
}

/* Turn on bit number N of BITSET, leaving every other bit alone.
   BITSET must have at least N / bits_per_word + 1 words.  */
static void
set_nth_bit (size_t n, word *bitset)
{
  /* Build the mask in a size_t so the shift is done at full word
     width rather than at the width of int.  */
  size_t mask = 1;
  mask <<= n % bits_per_word;

  bitset[n / bits_per_word] |= mask;
}
65 
66 /* Read a token from STREAM into TOKENBUFFER.
67    A token is delimited by any of the N_DELIM bytes in DELIM.
68    Upon return, the token is in tokenbuffer->buffer and
69    has a trailing '\0' instead of any original delimiter.
70    The function value is the length of the token not including
71    the final '\0'.  Upon EOF (i.e. on the call after the last
72    token is read) or error, return -1 without modifying tokenbuffer.
73    The EOF and error conditions may be distinguished in the caller
74    by testing ferror (STREAM).
75 
76    This function works properly on lines containing NUL bytes
77    and on files that do not end with a delimiter.  */
78 
79 size_t
readtoken(FILE * stream,const char * delim,size_t n_delim,token_buffer * tokenbuffer)80 readtoken (FILE *stream,
81            const char *delim,
82            size_t n_delim,
83            token_buffer *tokenbuffer)
84 {
85   char *p;
86   int c;
87   size_t i, n;
88   word isdelim[(UCHAR_MAX + bits_per_word) / bits_per_word];
89 
90   memset (isdelim, 0, sizeof isdelim);
91   for (i = 0; i < n_delim; i++)
92     {
93       unsigned char ch = delim[i];
94       set_nth_bit (ch, isdelim);
95     }
96 
97   /* skip over any leading delimiters */
98   for (c = getc (stream); c >= 0 && get_nth_bit (c, isdelim); c = getc (stream))
99     {
100       /* empty */
101     }
102 
103   p = tokenbuffer->buffer;
104   n = tokenbuffer->size;
105   i = 0;
106   for (;;)
107     {
108       if (c < 0 && i == 0)
109         return -1;
110 
111       if (i == n)
112         p = x2nrealloc (p, &n, sizeof *p);
113 
114       if (c < 0)
115         {
116           p[i] = 0;
117           break;
118         }
119       if (get_nth_bit (c, isdelim))
120         {
121           p[i] = 0;
122           break;
123         }
124       p[i++] = c;
125       c = getc (stream);
126     }
127 
128   tokenbuffer->buffer = p;
129   tokenbuffer->size = n;
130   return i;
131 }
132 
133 /* Build a NULL-terminated array of pointers to tokens
134    read from STREAM.  Return the number of tokens read.
135    All storage is obtained through calls to xmalloc-like functions.
136 
137    %%% Question: is it worth it to do a single
138    %%% realloc() of 'tokens' just before returning? */
139 
140 size_t
readtokens(FILE * stream,size_t projected_n_tokens,const char * delim,size_t n_delim,char *** tokens_out,size_t ** token_lengths)141 readtokens (FILE *stream,
142             size_t projected_n_tokens,
143             const char *delim,
144             size_t n_delim,
145             char ***tokens_out,
146             size_t **token_lengths)
147 {
148   token_buffer tb, *token = &tb;
149   char **tokens;
150   size_t *lengths;
151   size_t sz;
152   size_t n_tokens;
153 
154   if (projected_n_tokens == 0)
155     projected_n_tokens = 64;
156   else
157     projected_n_tokens++;       /* add one for trailing NULL pointer */
158 
159   sz = projected_n_tokens;
160   tokens = xnmalloc (sz, sizeof *tokens);
161   lengths = xnmalloc (sz, sizeof *lengths);
162 
163   n_tokens = 0;
164   init_tokenbuffer (token);
165   for (;;)
166     {
167       char *tmp;
168       size_t token_length = readtoken (stream, delim, n_delim, token);
169       if (n_tokens >= sz)
170         {
171           tokens = x2nrealloc (tokens, &sz, sizeof *tokens);
172           lengths = xnrealloc (lengths, sz, sizeof *lengths);
173         }
174 
175       if (token_length == (size_t) -1)
176         {
177           /* don't increment n_tokens for NULL entry */
178           tokens[n_tokens] = NULL;
179           lengths[n_tokens] = 0;
180           break;
181         }
182       tmp = xnmalloc (token_length + 1, sizeof *tmp);
183       lengths[n_tokens] = token_length;
184       tokens[n_tokens] = memcpy (tmp, token->buffer, token_length + 1);
185       n_tokens++;
186     }
187 
188   free (token->buffer);
189   *tokens_out = tokens;
190   if (token_lengths != NULL)
191     *token_lengths = lengths;
192   else
193     free (lengths);
194   return n_tokens;
195 }
196