/* vstr stuff and bug fixes by James Antill ... LGPL and above MIT.
   timestamp() function by Michael B Allen <mba2000@ioplex.com>
   gcc -g -Wall -W -O2 -o csv_vstr csv_vstr.c `pkg-config --cflags --libs vstr`
 */
5 
/* configuration... */
/* Flags that main() passes to every vcsv_row_parse() call. */
#define VCSV_OPT_FLAGS VCSV_FLAG_LINE
/* Non-zero: compile vdump() and make VDUMP() call it.
 * Zero: VDUMP() expands to nothing. */
#define USE_VDUMP 1
/* Non-zero: keep the assert() checks; zero: compile them out (see below). */
#define USE_DEBUG 0


/* NDEBUG has the opposite sense to USE_DEBUG, so it is defined when debugging
 * is off.  This has to happen before <assert.h> is included. */
#if !(USE_DEBUG)
# define NDEBUG 1 /* go backwards for traditional macro */
#endif
15 
16 #define VSTR_COMPILE_INCLUDE 1
17 #include <vstr.h>
18 #include <assert.h>
19 #include <string.h>
20 #include <unistd.h>
21 
/* option flags... (the "flags" argument of vcsv_row_parse()) */
#define VCSV_FLAG_NONE 0
#define VCSV_FLAG_LINE 1 /* only parse one line */


/* Local boolean constants. */
#define TRUE  1
#define FALSE 0


/* States of the vcsv_row_parse() state machine. */
/* Just after a comma: an immediate '\n' or '\r' ends the row with an empty
 * field, anything else moves to VCSV_ST_BEG. */
#define VCSV_ST_PRE           0
/* On the first character of a field: '"' starts a quoted field, ',' means an
 * empty field, anything else starts an unquoted field. */
#define VCSV_ST_BEG           1
/* Inside a quoted field, searching for the next '"'. */
#define VCSV_ST_GET_BEG_DQUOT 2
/* Just past a '"' inside a quoted field: the next character decides whether
 * the field ended or the quote was the first half of a doubled "" escape. */
#define VCSV_ST_GET_END_DQUOT 3
/* Inside an unquoted field, scanning for ',', '\r' or '\n'. */
#define VCSV_ST_GET_NORM      4
/* Positioned on a ',' that still has to be consumed. */
#define VCSV_ST_SKIP_COMMA    5
/* Unexpected characters followed a closing '"': discard up to the next ','. */
#define VCSV_ST_SKIP_TRASH    6
/* Positioned on end-of-line characters: return if VCSV_FLAG_LINE is set,
 * otherwise skip them exactly as VCSV_ST_INIT does. */
#define VCSV_ST_SKIP_RET      7
/* Initial state: skip leading '\n' / '\r' characters, then start a field. */
#define VCSV_ST_INIT          8
40 
/* Search the current iterator chunk for byte x.  Relies on a variable named
 * "iter" being in scope at the point of use. */
#define VCSV__MEMCHR(x) memchr(iter->ptr, (x), iter->len)

/* Consume x bytes of the current iterator chunk: advance iter->ptr, and
 * shrink both iter->len and the enclosing function's "len" (bytes left to
 * parse).  Relies on variables named "iter" and "len" being in scope; x is
 * evaluated once and must not exceed iter->len. */
#define VCSV__INC(x) do { size_t local_inc_tmp = (x); \
    assert(local_inc_tmp <= iter->len); \
    \
    iter->ptr += local_inc_tmp; \
    iter->len -= local_inc_tmp; \
    \
    len -= local_inc_tmp; \
 } while (FALSE)
51 
/* Output buffer: vdump() and main() append formatted text to it and then
 * drain it with vstr_sc_write_fd(). */
static Vstr_base *out = NULL;
/* The CSV input: main() maps the file into it, vcsv_row_parse() scans it and
 * vdump() prints field text out of it. */
static Vstr_base *vcsv_data = NULL;
54 
/*
 * Record one finished field as a section in rows.
 *
 *   pos      - position in the string where the current parse started
 *   data_len - total number of bytes covered by the current parse
 *   beg_len  - bytes that were still unparsed when the field's data began
 *   len      - bytes that are still unparsed where the field's data ends
 *
 * The field therefore starts (data_len - beg_len) bytes after pos and is
 * (beg_len - len) bytes long.  The result of vstr_sects_add() is not checked.
 */
static inline void vcsv_end(size_t pos, size_t data_len,
                            size_t beg_len, size_t len,
                            Vstr_sects *rows)
{
  const size_t field_pos = pos + (data_len - beg_len);
  const size_t field_len = beg_len - len;

  vstr_sects_add(rows, field_pos, field_len);
}
61 
62 static int
vcsv_row_parse(Vstr_base * s1,size_t pos,size_t len,Vstr_sects * rows,unsigned int flags)63 vcsv_row_parse(Vstr_base *s1, size_t pos, size_t len,
64                Vstr_sects *rows, unsigned int flags)
65 {
66   size_t data_len = len;
67   size_t ret_len  = len;
68   size_t beg_len  = 0;
69   unsigned int state = VCSV_ST_INIT;
70   const char *ptr = NULL;
71   size_t tmp = 0;
72   Vstr_iter iter[1];
73 
74   if (!len)
75     return (0);
76 
77   if (!vstr_iter_fwd_beg(s1, pos, len, iter))
78     abort();
79 
80   while (len)
81   {
82     if (!iter->len)
83       if (!vstr_iter_fwd_nxt(iter)) abort();
84 
85     switch (state)
86     {
87       case VCSV_ST_SKIP_TRASH:
88         if (!(ptr = VCSV__MEMCHR(',')))
89         {
90           VCSV__INC(iter->len);
91           break;
92         }
93         tmp = ptr - iter->ptr;
94 
95         VCSV__INC(tmp);
96         state = VCSV_ST_SKIP_COMMA;
97         break;
98 
99       case VCSV_ST_SKIP_COMMA:
100         assert(*iter->ptr == ',');
101 
102         VCSV__INC(1);
103         state = VCSV_ST_PRE;
104         beg_len = len;
105         break;
106 
107       case VCSV_ST_PRE:
108         if ((*iter->ptr == '\n') || (*iter->ptr == '\r'))
109         {
110           vcsv_end(pos, data_len, beg_len, len, rows);
111           state = VCSV_ST_SKIP_RET;
112         }
113         else
114           state = VCSV_ST_BEG;
115         break;
116 
117       case VCSV_ST_SKIP_RET:
118         if (flags & VCSV_FLAG_LINE)
119           return (ret_len - len);
120       case VCSV_ST_INIT:
121 
122         while ((*iter->ptr == '\n') || (*iter->ptr == '\r'))
123         { /* skip blanks to start... */
124           VCSV__INC(1);
125 
126           if (!iter->len && !vstr_iter_fwd_nxt(iter))
127             return (ret_len);
128         }
129         beg_len = len;
130         /* FALL THROUGH */
131 
132       case VCSV_ST_BEG:
133         if (*iter->ptr == '"')
134         {
135           state = VCSV_ST_GET_BEG_DQUOT;
136           --beg_len;
137         }
138         else if (*iter->ptr == ',')
139         {
140           vcsv_end(pos, data_len, beg_len, len, rows);
141           state = VCSV_ST_SKIP_COMMA;
142           continue;
143         }
144         else
145         {
146           beg_len = len; /* mark start */
147           state = VCSV_ST_GET_NORM;
148         }
149         VCSV__INC(1);
150         break;
151 
152       case VCSV_ST_GET_BEG_DQUOT:
153         if (!(ptr = VCSV__MEMCHR('"')))
154         {
155           VCSV__INC(iter->len);
156           break;
157         }
158         tmp = ptr - iter->ptr;
159         VCSV__INC(tmp + 1);
160         state = VCSV_ST_GET_END_DQUOT;
161         if (!len)
162         {
163           vcsv_end(pos, data_len, beg_len, 1, rows);
164           return (ret_len);
165         }
166         break;
167 
168       case VCSV_ST_GET_END_DQUOT:
169       {
170         unsigned int found_ret = FALSE;
171 
172         ++len; /* go back to the '"' */
173         switch (*iter->ptr)
174         {
175           case '\r':
176           case '\n':
177             found_ret = TRUE;
178           case ',':
179             vcsv_end(pos, data_len, beg_len, len, rows);
180             beg_len = 0;
181             if (found_ret)
182               state = VCSV_ST_SKIP_RET;
183             else
184               state = VCSV_ST_SKIP_COMMA;
185             break;
186 
187           default:
188             vcsv_end(pos, data_len, beg_len, len, rows);
189             beg_len = 0;
190             state = VCSV_ST_SKIP_TRASH;
191             break;
192 
193           case '"':
194           {
195             size_t tpos = pos + (data_len - beg_len) + (beg_len - len);
196 
197             vstr_del(s1, tpos, 1);
198 
199             --data_len; /* update lengths and re-init iter */
200             --beg_len;
201             len -= 2;   /* for above */
202 
203             if (!vstr_iter_fwd_beg(s1, tpos + 1, len, iter)) abort();
204             state = VCSV_ST_GET_BEG_DQUOT;
205             continue;
206           }
207           break;
208         }
209         --len; /* reverse above */
210       }
211       break;
212 
213       case VCSV_ST_GET_NORM:
214         tmp = 0;
215         while (tmp < iter->len)
216         {
217           if ((iter->ptr[tmp] == ',') ||
218               (iter->ptr[tmp] == '\r') ||
219               (iter->ptr[tmp] == '\n'))
220             break;
221 
222           ++tmp;
223         }
224         VCSV__INC(tmp);
225         if (!iter->len)
226           break;
227 
228         vcsv_end(pos, data_len, beg_len, len, rows);
229         if (iter->ptr[0] == ',')
230           state = VCSV_ST_SKIP_COMMA;
231         else
232           state = VCSV_ST_SKIP_RET;
233         break;
234 
235       default:
236         abort();
237     }
238   }
239 
240   if ((state != VCSV_ST_SKIP_RET) && (state != VCSV_ST_SKIP_TRASH))
241     vcsv_end(pos, data_len, beg_len, len, rows);
242 
243   return (ret_len);
244 }
245 
246 #if USE_VDUMP
247 # define VDUMP(ret, rows) vdump(ret, rows)
/*
 * Print one parsed row: a header holding ret between two '=' rules, every
 * section in rows as "|text|" on a line of its own, and a closing '-' rule.
 * The text is built in the global "out" buffer, written to file descriptor 1
 * until the buffer is empty, and abort() is called if a write fails.
 *
 *   ret  - value shown in the header (main() passes vcsv_row_parse()'s result)
 *   rows - sections to print; their text is read from the global vcsv_data
 */
static void vdump(unsigned int ret, Vstr_sects *rows)
{
  /* Must be a size_t: it is consumed by %zu through varargs, and a plain int
   * constant has the wrong type for that. */
  const size_t rule_len = 79;
  unsigned int scan = 0;

  vstr_add_fmt(out, out->len,
               "${rep_chr:%c%zu}\n" "%u\n" "${rep_chr:%c%zu}\n",
               '=', rule_len, ret, '=', rule_len);

  while (scan < rows->num)
  {
    size_t pos = 1;
    size_t len = 0;

    ++scan; /* VSTR_SECTS_NUM() is used with indexes 1..num here */

    len = VSTR_SECTS_NUM(rows, scan)->len;
    if (len) /* for an empty field keep pos at 1 and print nothing */
      pos = VSTR_SECTS_NUM(rows, scan)->pos;

    vstr_add_fmt(out, out->len, "|${vstr:%p%zu%zu%u}|\n",
                 vcsv_data, pos, len, 0);
  }

  vstr_add_fmt(out, out->len, "${rep_chr:%c%zu}\n", '-', rule_len);

  while (out->len)
    if (!vstr_sc_write_fd(out, 1, out->len, 1, NULL))
      abort();
}
277 #else
278 # define VDUMP(ret, rows) /* nothing */
279 #endif
280 
281 #if defined(_WIN32)
282 #include <Windows.h>
283 
284 #define MILLISECONDS_BETWEEN_1970_AND_1601 11644473600000Ui64
285 typedef unsigned __int64 uint64_t;
286 
287 uint64_t
timestamp(void)288 timestamp(void)
289 {
290         FILETIME ftime;
291         uint64_t ret;
292 
293         GetSystemTimeAsFileTime(&ftime);
294 
295         ret = ftime.dwHighDateTime;
296         ret <<= 32Ui64;
297         ret |= ftime.dwLowDateTime;
298         ret = ret / 10000Ui64 - MILLISECONDS_BETWEEN_1970_AND_1601;
299 
300         return ret;
301 }
302 
303 #else
304 #include <sys/time.h>
305 #include <inttypes.h>
306 
307 
/*
 * POSIX variant: return the time reported by gettimeofday() as a number of
 * milliseconds.  The result of gettimeofday() is not checked.
 */
static inline uint64_t
timestamp(void)
{
        struct timeval now;

        gettimeofday(&now, NULL);

        /* whole seconds scaled up, plus the sub-second part scaled down */
        const long long from_seconds = now.tv_sec * 1000LL;
        const long long from_micros  = now.tv_usec / 1000;

        return from_seconds + from_micros;
}
317 
318 #endif
319 
main(int argc,char * argv[])320 int main(int argc, char *argv[])
321 {
322   VSTR_SECTS_DECL(vrows, 256);
323 
324     int ret = 0;
325 
326   if (argc != 2) abort();
327 
328   if (!vstr_init())
329     abort();
330 
331   VSTR_SECTS_DECL_INIT(vrows);
332 
333   vcsv_data = vstr_make_base(NULL);
334 
335   out       = vstr_make_base(NULL);
336   vstr_cntl_conf(NULL, VSTR_CNTL_CONF_SET_FMT_CHAR_ESC, '$');
337   vstr_sc_fmt_add_all(NULL);
338 
339   do
340   {
341     size_t pos = 1;
342     size_t len = 0;
343     unsigned int err = 0;
344     uint64_t t0 = timestamp();
345 
346     if (!vstr_sc_mmap_file(vcsv_data, 0, argv[1], 0, 0, &err))
347       break;
348 
349     len = vcsv_data->len;
350     if (err)
351       abort();
352 
353     while ((ret = vcsv_row_parse(vcsv_data, pos, len, vrows, VCSV_OPT_FLAGS)))
354     {
355       VDUMP(ret, vrows);
356 
357       pos += ret;
358       len -= ret;
359       vrows->num = 0;
360     }
361 
362     vstr_add_fmt(out, out->len, "%'llu milliseconds\n", (timestamp() - t0));
363 
364     while (out->len)
365       if (!vstr_sc_write_fd(out, 1, out->len, 2, NULL))
366         abort();
367   } while (FALSE);
368 
369   exit (EXIT_SUCCESS);
370 }
371