1 /* GNU Datamash - perform simple calculation on input data
2 
3    Copyright (C) 2013-2020 Assaf Gordon <assafgordon@gmail.com>
4 
5    This file is part of GNU Datamash.
6 
7    GNU Datamash is free software: you can redistribute it and/or modify
8    it under the terms of the GNU General Public License as published by
9    the Free Software Foundation, either version 3 of the License, or
10    (at your option) any later version.
11 
12    GNU Datamash is distributed in the hope that it will be useful,
13    but WITHOUT ANY WARRANTY; without even the implied warranty of
14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15    GNU General Public License for more details.
16 
17    You should have received a copy of the GNU General Public License
18    along with GNU Datamash.  If not, see <https://www.gnu.org/licenses/>.
19 */
20 
21 /* Written by Assaf Gordon */
22 #include <config.h>
23 #include <string.h>
24 #include <inttypes.h>
25 #include <stdlib.h>
26 #include <stdbool.h>
27 #include <stdio.h>
28 #include <ctype.h>
29 #include <assert.h>
30 #include <errno.h>
31 
32 #include "system.h"
33 
34 #include "die.h"
35 #include "op-scanner.h"
36 
37 /* Used by other modules */
38 uintmax_t scan_val_int;
39 long double scan_val_float;
40 char* scanner_identifier;
41 bool scanner_keep_whitespace = false;
42 
43 /* Internal */
44 static char* scanner_input;
45 static char* scan_pos;
46 static size_t scanner_identifier_len;
47 static bool have_peek;
48 static enum TOKEN scan_peek;
49 
50 static inline void
set_identifier(const char * data,size_t n)51 set_identifier (const char* data, size_t n)
52 {
53   if (n>=scanner_identifier_len)
54     {
55       scanner_identifier = xrealloc (scanner_identifier,n+1);
56       scanner_identifier_len = n+1;
57     }
58   memcpy (scanner_identifier, data, n);
59   scanner_identifier[n]=0;
60 }
61 
62 /* Concatante argv into one (space separated) string */
63 void
scanner_set_input_from_argv(int argc,const char * argv[])64 scanner_set_input_from_argv (int argc, const char* argv[])
65 {
66   assert (scanner_input == NULL);                /* LCOV_EXCL_LINE */
67 
68   size_t len = 1; /* +1 for NUL */
69   for (int i=0;i<argc;++i)
70     len += strlen (argv[i])+1; /* +1 for space */
71 
72   char *p = scan_pos = scanner_input = XCALLOC (len, char);
73   for (int i=0; i<argc; ++i)
74   {
75       if (i>0)
76         p = stpcpy (p, " ");
77       p = stpcpy (p, argv[i]);
78   }
79 }
80 
81 void
scanner_free()82 scanner_free ()
83 {
84   free (scanner_identifier);
85   scanner_identifier = NULL;
86   scanner_identifier_len = 0;
87 
88   free (scanner_input);
89   scanner_input = NULL;
90   scan_pos = NULL;
91   have_peek = false;
92 }
93 
94 enum TOKEN
scanner_peek_token()95 scanner_peek_token ()
96 {
97   if (have_peek)
98     return scan_peek;
99 
100   scan_peek = scanner_get_token ();
101   have_peek = true;
102   return scan_peek;
103 }
104 
105 enum TOKEN
scanner_get_token()106 scanner_get_token ()
107 {
108   char ident[MAX_IDENTIFIER_LENGTH];
109   char *pend;
110 
111   if (have_peek)
112     {
113       have_peek = false;
114       return scan_peek;
115     }
116 
117   assert (scan_pos != NULL);                      /* LCOV_EXCL_LINE */
118 
119   if (*scan_pos == '\0')
120     return TOK_END;
121 
122   /* White space */
123   if (c_isspace (*scan_pos))
124     {
125       while ( c_isspace (*scan_pos) )
126         ++scan_pos;
127       if (scanner_keep_whitespace)
128         {
129           if (*scan_pos == '\0')
130             return TOK_END;
131           return TOK_WHITESPACE;
132         }
133     }
134 
135   /* special characters */
136   if (*scan_pos == ',')
137     {
138       ++scan_pos;
139       set_identifier (",", 1);
140       return TOK_COMMA;
141     }
142   if (*scan_pos == '-')
143     {
144       ++scan_pos;
145       set_identifier ("-", 1);
146       return TOK_DASH;
147     }
148   if (*scan_pos == ':')
149     {
150       ++scan_pos;
151       set_identifier (":", 1);
152       return TOK_COLONS;
153     }
154 
155   /* Integer or floating-point value */
156   if (c_isdigit (*scan_pos))
157     {
158       enum TOKEN rc = TOK_INTEGER;
159       errno = 0;
160       scan_val_int = strtol (scan_pos, &pend, 10);
161 
162       if (*pend == '.')
163         {
164           /* a floating-point value */
165           scan_val_float = strtold (scan_pos, &pend);
166           rc = TOK_FLOAT;
167         }
168       if ((c_isalpha (*pend) || *pend=='_') || (errno == ERANGE))
169         die (EXIT_FAILURE, 0, _("invalid numeric value '%s'"),
170                                 scan_pos);
171 
172       set_identifier (scan_pos, pend-scan_pos);
173       scan_pos = pend;
174       return rc;
175     }
176 
177   /* a valid identifier ( [a-z_][a-z0-9_]+ ),
178      also backslash-CHAR is accepted as part of the identifier,
179      to allow dash, colons, */
180   if (c_isalpha (*scan_pos) || *scan_pos == '_' || *scan_pos == '\\')
181     {
182       int len = 0;
183       char *v = scan_pos;
184       while (1)
185         {
186           if (c_isalpha (*v) || c_isdigit (*v) || *v=='_' )
187             {
188               // Accept charracter
189             }
190           else if (*v == '\\')
191             {
192               ++v;
193               if (!*v)
194                 die (EXIT_FAILURE, 0, _("backslash at end of identifier"));
195 
196               // Accept following character
197             }
198           else
199             break;
200 
201           if (len >= (MAX_IDENTIFIER_LENGTH-1))
202             die (EXIT_FAILURE, 0, _("identifier name too long"));
203 
204           ident[len++] = *v;
205           ++v;
206         }
207       ident[len] = '\0';
208 
209       set_identifier (ident, len);
210       scan_pos = v;
211 
212       return TOK_IDENTIFIER;
213     }
214 
215   die (EXIT_FAILURE, 0, _("invalid operand %s"), quote (scan_pos));
216   return TOK_END;
217 }
218 
219 
220 #ifdef SCANNER_TEST_MAIN
221 /*
222  Trivial scanner tester.
223  To compile:
224     cc -D_STANDALONE_ -DSCANNER_TEST_MAIN \
225        -I. \
226        -std=c99 -Wall -Wextra -Werror -g -O0 \
227        -o dm-scanner ./src/op-scanner.c
228  Test:
229     ./dm-scanner groupby 1,2 sum 4-7
230     ./dm-scanner ppearson 1:6
231     ./dm-scanner foo bar 9.5f
232 */
233 #define TESTMAIN main
TESTMAIN(int argc,const char * argv[])234 int TESTMAIN (int argc, const char* argv[])
235 {
236   if (argc<2)
237     die (EXIT_FAILURE, 0, _("missing script (among arguments)"));
238 
239   scanner_set_input_from_argv (argc-1, argv+1);
240 
241   enum TOKEN tok;
242   while ( (tok = scanner_get_token ()) != TOK_END )
243   {
244     switch (tok)
245     {
246     case TOK_IDENTIFIER:
247       printf ("TOK_IDENTIFIER: '%s'\n", scanner_identifier);
248       break;
249 
250     case TOK_INTEGER:
251       printf ("TOK_INTEGER: %lu ('%s')\n", scan_val_int, scanner_identifier);
252       break;
253 
254     case TOK_FLOAT:
255       printf ("TOK_FLOAT: %Lf ('%s')\n", scan_val_float, scanner_identifier);
256       break;
257 
258     case TOK_COMMA:
259       printf ("TOK_COMMA\n");
260       break;
261 
262     case TOK_DASH:
263       printf ("TOK_DASH\n");
264       break;
265 
266     case TOK_COLONS:
267       printf ("TOK_COLONS\n");
268       break;
269 
270     default:
271       die (EXIT_FAILURE, 0 ,_("unknown token %d\n"),tok);
272     }
273   }
274 
275   return 0;
276 }
277 #endif
278 
279 /* vim: set cinoptions=>4,n-2,{2,^-2,:2,=2,g0,h2,p5,t0,+2,(0,u0,w1,m1: */
280 /* vim: set shiftwidth=2: */
281 /* vim: set tabstop=2: */
282 /* vim: set expandtab: */
283