1 /* readtokens.c  -- Functions for reading tokens from an input stream.
2 
3    Copyright (C) 1990-1991, 1999-2004, 2006, 2009-2021 Free Software
4    Foundation, Inc.
5 
6    This program is free software: you can redistribute it and/or modify
7    it under the terms of the GNU General Public License as published by
8    the Free Software Foundation; either version 3 of the License, or
9    (at your option) any later version.
10 
11    This program is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14    GNU General Public License for more details.
15 
16    You should have received a copy of the GNU General Public License
17    along with this program.  If not, see <https://www.gnu.org/licenses/>.
18 
19    Written by Jim Meyering. */
20 
21 /* This almost supersedes xreadline stuff -- using delim="\n"
22    gives the same functionality, except that these functions
23    would never return empty lines. */
24 
25 #include <config.h>
26 
27 #include "readtokens.h"
28 
29 #include <limits.h>
30 #include <stdio.h>
31 #include <stdlib.h>
32 #include <string.h>
33 #include <stdbool.h>
34 
35 #include "xalloc.h"
36 
37 #if USE_UNLOCKED_IO
38 # include "unlocked-io.h"
39 #endif
40 
41 /* Initialize a tokenbuffer. */
42 
43 void
init_tokenbuffer(token_buffer * tokenbuffer)44 init_tokenbuffer (token_buffer *tokenbuffer)
45 {
46   tokenbuffer->size = 0;
47   tokenbuffer->buffer = NULL;
48 }
49 
/* Bit sets in this file are arrays of 'word'.  */
typedef size_t word;

/* Number of bits held by one 'word'.  */
enum
{
  bits_per_word = CHAR_BIT * sizeof (word)
};
52 
53 static bool
get_nth_bit(size_t n,word const * bitset)54 get_nth_bit (size_t n, word const *bitset)
55 {
56   return bitset[n / bits_per_word] >> n % bits_per_word & 1;
57 }
58 
59 static void
set_nth_bit(size_t n,word * bitset)60 set_nth_bit (size_t n, word *bitset)
61 {
62   size_t one = 1;
63   bitset[n / bits_per_word] |= one << n % bits_per_word;
64 }
65 
66 /* Read a token from STREAM into TOKENBUFFER.
67    A token is delimited by any of the N_DELIM bytes in DELIM.
68    Upon return, the token is in tokenbuffer->buffer and
69    has a trailing '\0' instead of any original delimiter.
70    The function value is the length of the token not including
71    the final '\0'.  Upon EOF (i.e. on the call after the last
72    token is read) or error, return -1 without modifying tokenbuffer.
73    The EOF and error conditions may be distinguished in the caller
74    by testing ferror (STREAM).
75 
76    This function works properly on lines containing NUL bytes
77    and on files that do not end with a delimiter.  */
78 
79 size_t
readtoken(FILE * stream,const char * delim,size_t n_delim,token_buffer * tokenbuffer)80 readtoken (FILE *stream,
81            const char *delim,
82            size_t n_delim,
83            token_buffer *tokenbuffer)
84 {
85   int c;
86   idx_t i;
87   word isdelim[(UCHAR_MAX + bits_per_word) / bits_per_word];
88 
89   memset (isdelim, 0, sizeof isdelim);
90   for (i = 0; i < n_delim; i++)
91     {
92       unsigned char ch = delim[i];
93       set_nth_bit (ch, isdelim);
94     }
95 
96   /* skip over any leading delimiters */
97   for (c = getc (stream); c >= 0 && get_nth_bit (c, isdelim); c = getc (stream))
98     {
99       /* empty */
100     }
101 
102   char *p = tokenbuffer->buffer;
103   idx_t n = tokenbuffer->size;
104   i = 0;
105   for (;;)
106     {
107       if (c < 0 && i == 0)
108         return -1;
109 
110       if (i == n)
111         p = xpalloc (p, &n, 1, -1, sizeof *p);
112 
113       if (c < 0)
114         {
115           p[i] = 0;
116           break;
117         }
118       if (get_nth_bit (c, isdelim))
119         {
120           p[i] = 0;
121           break;
122         }
123       p[i++] = c;
124       c = getc (stream);
125     }
126 
127   tokenbuffer->buffer = p;
128   tokenbuffer->size = n;
129   return i;
130 }
131 
132 /* Build a NULL-terminated array of pointers to tokens
133    read from STREAM.  Return the number of tokens read.
134    All storage is obtained through calls to xmalloc-like functions.
135 
136    %%% Question: is it worth it to do a single
137    %%% realloc() of 'tokens' just before returning? */
138 
139 size_t
readtokens(FILE * stream,size_t projected_n_tokens,const char * delim,size_t n_delim,char *** tokens_out,size_t ** token_lengths)140 readtokens (FILE *stream,
141             size_t projected_n_tokens,
142             const char *delim,
143             size_t n_delim,
144             char ***tokens_out,
145             size_t **token_lengths)
146 {
147   token_buffer tb, *token = &tb;
148   char **tokens;
149   size_t *lengths;
150   idx_t sz, n_tokens;
151 
152   if (projected_n_tokens == 0)
153     projected_n_tokens = 64;
154   else
155     projected_n_tokens++;       /* add one for trailing NULL pointer */
156 
157   sz = projected_n_tokens;
158   tokens = xnmalloc (sz, sizeof *tokens);
159   lengths = xnmalloc (sz, sizeof *lengths);
160 
161   n_tokens = 0;
162   init_tokenbuffer (token);
163   for (;;)
164     {
165       char *tmp;
166       size_t token_length = readtoken (stream, delim, n_delim, token);
167       if (n_tokens >= sz)
168         {
169           tokens = xpalloc (tokens, &sz, 1, -1, sizeof *tokens);
170           lengths = xreallocarray (lengths, sz, sizeof *lengths);
171         }
172 
173       if (token_length == (size_t) -1)
174         {
175           /* don't increment n_tokens for NULL entry */
176           tokens[n_tokens] = NULL;
177           lengths[n_tokens] = 0;
178           break;
179         }
180       tmp = xnmalloc (token_length + 1, sizeof *tmp);
181       lengths[n_tokens] = token_length;
182       tokens[n_tokens] = memcpy (tmp, token->buffer, token_length + 1);
183       n_tokens++;
184     }
185 
186   free (token->buffer);
187   *tokens_out = tokens;
188   if (token_lengths != NULL)
189     *token_lengths = lengths;
190   else
191     free (lengths);
192   return n_tokens;
193 }
194