1 /* GNU Datamash - perform simple calculation on input data
2
3 Copyright (C) 2013-2020 Assaf Gordon <assafgordon@gmail.com>
4
5 This file is part of GNU Datamash.
6
7 GNU Datamash is free software: you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation, either version 3 of the License, or
10 (at your option) any later version.
11
12 GNU Datamash is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GNU Datamash. If not, see <https://www.gnu.org/licenses/>.
19 */
20
21 /* Written by Assaf Gordon */
22 #include <config.h>
23 #include <string.h>
24 #include <inttypes.h>
25 #include <stdlib.h>
26 #include <stdbool.h>
27 #include <stdio.h>
28 #include <ctype.h>
29 #include <assert.h>
30 #include <errno.h>
31
32 #include "system.h"
33
34 #include "die.h"
35 #include "op-scanner.h"
36
/* Used by other modules */

/* Result of the integer conversion performed on the most recently
   scanned numeric token.  */
uintmax_t scan_val_int;
/* Value of the most recently scanned floating-point token.  */
long double scan_val_float;
/* NUL-terminated text of the most recent token that recorded one.
   The buffer is (re)allocated by set_identifier and released by
   scanner_free.  */
char* scanner_identifier;
/* When true, a run of whitespace followed by more input is reported
   as TOK_WHITESPACE instead of being skipped silently.  */
bool scanner_keep_whitespace = false;

/* Internal */

/* Allocated buffer holding the complete input text.  */
static char* scanner_input;
/* Next unread character inside scanner_input.  */
static char* scan_pos;
/* Number of bytes currently allocated for scanner_identifier.  */
static size_t scanner_identifier_len;
/* True while scan_peek holds a token that was peeked but not yet
   consumed.  */
static bool have_peek;
/* Token cached by scanner_peek_token.  */
static enum TOKEN scan_peek;
49
50 static inline void
set_identifier(const char * data,size_t n)51 set_identifier (const char* data, size_t n)
52 {
53 if (n>=scanner_identifier_len)
54 {
55 scanner_identifier = xrealloc (scanner_identifier,n+1);
56 scanner_identifier_len = n+1;
57 }
58 memcpy (scanner_identifier, data, n);
59 scanner_identifier[n]=0;
60 }
61
62 /* Concatante argv into one (space separated) string */
63 void
scanner_set_input_from_argv(int argc,const char * argv[])64 scanner_set_input_from_argv (int argc, const char* argv[])
65 {
66 assert (scanner_input == NULL); /* LCOV_EXCL_LINE */
67
68 size_t len = 1; /* +1 for NUL */
69 for (int i=0;i<argc;++i)
70 len += strlen (argv[i])+1; /* +1 for space */
71
72 char *p = scan_pos = scanner_input = XCALLOC (len, char);
73 for (int i=0; i<argc; ++i)
74 {
75 if (i>0)
76 p = stpcpy (p, " ");
77 p = stpcpy (p, argv[i]);
78 }
79 }
80
81 void
scanner_free()82 scanner_free ()
83 {
84 free (scanner_identifier);
85 scanner_identifier = NULL;
86 scanner_identifier_len = 0;
87
88 free (scanner_input);
89 scanner_input = NULL;
90 scan_pos = NULL;
91 have_peek = false;
92 }
93
94 enum TOKEN
scanner_peek_token()95 scanner_peek_token ()
96 {
97 if (have_peek)
98 return scan_peek;
99
100 scan_peek = scanner_get_token ();
101 have_peek = true;
102 return scan_peek;
103 }
104
105 enum TOKEN
scanner_get_token()106 scanner_get_token ()
107 {
108 char ident[MAX_IDENTIFIER_LENGTH];
109 char *pend;
110
111 if (have_peek)
112 {
113 have_peek = false;
114 return scan_peek;
115 }
116
117 assert (scan_pos != NULL); /* LCOV_EXCL_LINE */
118
119 if (*scan_pos == '\0')
120 return TOK_END;
121
122 /* White space */
123 if (c_isspace (*scan_pos))
124 {
125 while ( c_isspace (*scan_pos) )
126 ++scan_pos;
127 if (scanner_keep_whitespace)
128 {
129 if (*scan_pos == '\0')
130 return TOK_END;
131 return TOK_WHITESPACE;
132 }
133 }
134
135 /* special characters */
136 if (*scan_pos == ',')
137 {
138 ++scan_pos;
139 set_identifier (",", 1);
140 return TOK_COMMA;
141 }
142 if (*scan_pos == '-')
143 {
144 ++scan_pos;
145 set_identifier ("-", 1);
146 return TOK_DASH;
147 }
148 if (*scan_pos == ':')
149 {
150 ++scan_pos;
151 set_identifier (":", 1);
152 return TOK_COLONS;
153 }
154
155 /* Integer or floating-point value */
156 if (c_isdigit (*scan_pos))
157 {
158 enum TOKEN rc = TOK_INTEGER;
159 errno = 0;
160 scan_val_int = strtol (scan_pos, &pend, 10);
161
162 if (*pend == '.')
163 {
164 /* a floating-point value */
165 scan_val_float = strtold (scan_pos, &pend);
166 rc = TOK_FLOAT;
167 }
168 if ((c_isalpha (*pend) || *pend=='_') || (errno == ERANGE))
169 die (EXIT_FAILURE, 0, _("invalid numeric value '%s'"),
170 scan_pos);
171
172 set_identifier (scan_pos, pend-scan_pos);
173 scan_pos = pend;
174 return rc;
175 }
176
177 /* a valid identifier ( [a-z_][a-z0-9_]+ ),
178 also backslash-CHAR is accepted as part of the identifier,
179 to allow dash, colons, */
180 if (c_isalpha (*scan_pos) || *scan_pos == '_' || *scan_pos == '\\')
181 {
182 int len = 0;
183 char *v = scan_pos;
184 while (1)
185 {
186 if (c_isalpha (*v) || c_isdigit (*v) || *v=='_' )
187 {
188 // Accept charracter
189 }
190 else if (*v == '\\')
191 {
192 ++v;
193 if (!*v)
194 die (EXIT_FAILURE, 0, _("backslash at end of identifier"));
195
196 // Accept following character
197 }
198 else
199 break;
200
201 if (len >= (MAX_IDENTIFIER_LENGTH-1))
202 die (EXIT_FAILURE, 0, _("identifier name too long"));
203
204 ident[len++] = *v;
205 ++v;
206 }
207 ident[len] = '\0';
208
209 set_identifier (ident, len);
210 scan_pos = v;
211
212 return TOK_IDENTIFIER;
213 }
214
215 die (EXIT_FAILURE, 0, _("invalid operand %s"), quote (scan_pos));
216 return TOK_END;
217 }
218
219
#ifdef SCANNER_TEST_MAIN
/*
  Trivial scanner tester: joins the command-line arguments into one
  input string and prints every token the scanner produces.

  To compile:
    cc -D_STANDALONE_ -DSCANNER_TEST_MAIN \
       -I. \
       -std=c99 -Wall -Wextra -Werror -g -O0 \
       -o dm-scanner ./src/op-scanner.c
  Test:
    ./dm-scanner groupby 1,2 sum 4-7
    ./dm-scanner ppearson 1:6
    ./dm-scanner foo bar 9.5f
*/
#define TESTMAIN main
int TESTMAIN (int argc, const char* argv[])
{
  if (argc<2)
    die (EXIT_FAILURE, 0, _("missing script (among arguments)"));

  /* Skip argv[0] (the program name).  */
  scanner_set_input_from_argv (argc-1, argv+1);

  enum TOKEN tok;
  while ( (tok = scanner_get_token ()) != TOK_END )
    {
      switch (tok)
        {
        case TOK_IDENTIFIER:
          printf ("TOK_IDENTIFIER: '%s'\n", scanner_identifier);
          break;

        case TOK_INTEGER:
          /* scan_val_int is a uintmax_t, so it needs PRIuMAX.  */
          printf ("TOK_INTEGER: %" PRIuMAX " ('%s')\n",
                  scan_val_int, scanner_identifier);
          break;

        case TOK_FLOAT:
          printf ("TOK_FLOAT: %Lf ('%s')\n", scan_val_float, scanner_identifier);
          break;

        case TOK_COMMA:
          printf ("TOK_COMMA\n");
          break;

        case TOK_DASH:
          printf ("TOK_DASH\n");
          break;

        case TOK_COLONS:
          printf ("TOK_COLONS\n");
          break;

        default:
          die (EXIT_FAILURE, 0 ,_("unknown token %d\n"), (int) tok);
        }
    }

  /* Release the scanner's buffers before exiting.  */
  scanner_free ();

  return 0;
}
#endif
278
279 /* vim: set cinoptions=>4,n-2,{2,^-2,:2,=2,g0,h2,p5,t0,+2,(0,u0,w1,m1: */
280 /* vim: set shiftwidth=2: */
281 /* vim: set tabstop=2: */
282 /* vim: set expandtab: */
283