/* vstr stuff and bug fixes by james antill ... LGPL and above MIT.
   timestamp() function by Michael B Allen <mba2000@ioplex.com>
   gcc -g -Wall -W -O2 -o csv_vstr csv_vstr.c `pkg-config --cflags --libs vstr`
 */
5
6 /* configuration... */
7 #define VCSV_OPT_FLAGS VCSV_FLAG_LINE
8 #define USE_VDUMP 1
9 #define USE_DEBUG 0
10
11
12 #if !(USE_DEBUG)
13 # define NDEBUG 1 /* go backwards for traditional macro */
14 #endif
15
16 #define VSTR_COMPILE_INCLUDE 1
17 #include <vstr.h>
18 #include <assert.h>
19 #include <string.h>
20 #include <unistd.h>
21
/* Option flags accepted by vcsv_row_parse() in its `flags` argument. */
#define VCSV_FLAG_NONE 0
#define VCSV_FLAG_LINE 1 /* only parse one line */


#define TRUE 1
#define FALSE 0


/* Parser states used by the switch in vcsv_row_parse(). */
enum
{
  VCSV_ST_PRE           = 0, /* just after a comma: end of row or new field? */
  VCSV_ST_BEG           = 1, /* first byte of a field */
  VCSV_ST_GET_BEG_DQUOT = 2, /* inside a quoted field, looking for a '"' */
  VCSV_ST_GET_END_DQUOT = 3, /* byte right after a '"' inside a quoted field */
  VCSV_ST_GET_NORM      = 4, /* inside an unquoted field */
  VCSV_ST_SKIP_COMMA    = 5, /* sitting on a ',' that has to be stepped over */
  VCSV_ST_SKIP_TRASH    = 6, /* junk after a closing quote, up to the next ',' */
  VCSV_ST_SKIP_RET      = 7, /* a row has just ended */
  VCSV_ST_INIT          = 8  /* initial state on entry to the parser */
};

/* Look for byte (x) in what is left of the current iterator node.
 * Expands to memchr(), so the result is NULL when (x) is not in this node.
 * Requires an `iter` with `ptr` and `len` members at the point of use. */
#define VCSV__MEMCHR(x) memchr(iter->ptr, (x), iter->len)

/* Step (x) bytes forward inside the current iterator node.
 * (x) is evaluated exactly once; the step is asserted not to leave the node.
 * Updates iter->ptr, iter->len and the `len` variable that must be in scope
 * at the point of use. */
#define VCSV__INC(x) do {                                               \
      const size_t vcsv_inc_by = (x);                                   \
                                                                        \
      assert(vcsv_inc_by <= iter->len);                                 \
                                                                        \
      iter->len -= vcsv_inc_by;                                         \
      iter->ptr += vcsv_inc_by;                                         \
                                                                        \
      len -= vcsv_inc_by;                                               \
    } while (0)
51
52 static Vstr_base *out = NULL;
53 static Vstr_base *vcsv_data = NULL;
54
/* Record one finished field as a section in `rows`.
 *
 * pos      - position in the string where parsing started
 * data_len - current length of the range being parsed
 * beg_len  - value the remaining-length counter had at the field's start
 * len      - value the remaining-length counter has at the field's end
 *
 * The field starts (data_len - beg_len) bytes after pos and is
 * (beg_len - len) bytes long.  The result of vstr_sects_add() is not
 * looked at. */
static inline void vcsv_end(size_t pos, size_t data_len,
                            size_t beg_len, size_t len,
                            Vstr_sects *rows)
{
  const size_t field_pos = pos + (data_len - beg_len);
  const size_t field_len = beg_len - len;

  vstr_sects_add(rows, field_pos, field_len);
}
61
62 static int
vcsv_row_parse(Vstr_base * s1,size_t pos,size_t len,Vstr_sects * rows,unsigned int flags)63 vcsv_row_parse(Vstr_base *s1, size_t pos, size_t len,
64 Vstr_sects *rows, unsigned int flags)
65 {
66 size_t data_len = len;
67 size_t ret_len = len;
68 size_t beg_len = 0;
69 unsigned int state = VCSV_ST_INIT;
70 const char *ptr = NULL;
71 size_t tmp = 0;
72 Vstr_iter iter[1];
73
74 if (!len)
75 return (0);
76
77 if (!vstr_iter_fwd_beg(s1, pos, len, iter))
78 abort();
79
80 while (len)
81 {
82 if (!iter->len)
83 if (!vstr_iter_fwd_nxt(iter)) abort();
84
85 switch (state)
86 {
87 case VCSV_ST_SKIP_TRASH:
88 if (!(ptr = VCSV__MEMCHR(',')))
89 {
90 VCSV__INC(iter->len);
91 break;
92 }
93 tmp = ptr - iter->ptr;
94
95 VCSV__INC(tmp);
96 state = VCSV_ST_SKIP_COMMA;
97 break;
98
99 case VCSV_ST_SKIP_COMMA:
100 assert(*iter->ptr == ',');
101
102 VCSV__INC(1);
103 state = VCSV_ST_PRE;
104 beg_len = len;
105 break;
106
107 case VCSV_ST_PRE:
108 if ((*iter->ptr == '\n') || (*iter->ptr == '\r'))
109 {
110 vcsv_end(pos, data_len, beg_len, len, rows);
111 state = VCSV_ST_SKIP_RET;
112 }
113 else
114 state = VCSV_ST_BEG;
115 break;
116
117 case VCSV_ST_SKIP_RET:
118 if (flags & VCSV_FLAG_LINE)
119 return (ret_len - len);
120 case VCSV_ST_INIT:
121
122 while ((*iter->ptr == '\n') || (*iter->ptr == '\r'))
123 { /* skip blanks to start... */
124 VCSV__INC(1);
125
126 if (!iter->len && !vstr_iter_fwd_nxt(iter))
127 return (ret_len);
128 }
129 beg_len = len;
130 /* FALL THROUGH */
131
132 case VCSV_ST_BEG:
133 if (*iter->ptr == '"')
134 {
135 state = VCSV_ST_GET_BEG_DQUOT;
136 --beg_len;
137 }
138 else if (*iter->ptr == ',')
139 {
140 vcsv_end(pos, data_len, beg_len, len, rows);
141 state = VCSV_ST_SKIP_COMMA;
142 continue;
143 }
144 else
145 {
146 beg_len = len; /* mark start */
147 state = VCSV_ST_GET_NORM;
148 }
149 VCSV__INC(1);
150 break;
151
152 case VCSV_ST_GET_BEG_DQUOT:
153 if (!(ptr = VCSV__MEMCHR('"')))
154 {
155 VCSV__INC(iter->len);
156 break;
157 }
158 tmp = ptr - iter->ptr;
159 VCSV__INC(tmp + 1);
160 state = VCSV_ST_GET_END_DQUOT;
161 if (!len)
162 {
163 vcsv_end(pos, data_len, beg_len, 1, rows);
164 return (ret_len);
165 }
166 break;
167
168 case VCSV_ST_GET_END_DQUOT:
169 {
170 unsigned int found_ret = FALSE;
171
172 ++len; /* go back to the '"' */
173 switch (*iter->ptr)
174 {
175 case '\r':
176 case '\n':
177 found_ret = TRUE;
178 case ',':
179 vcsv_end(pos, data_len, beg_len, len, rows);
180 beg_len = 0;
181 if (found_ret)
182 state = VCSV_ST_SKIP_RET;
183 else
184 state = VCSV_ST_SKIP_COMMA;
185 break;
186
187 default:
188 vcsv_end(pos, data_len, beg_len, len, rows);
189 beg_len = 0;
190 state = VCSV_ST_SKIP_TRASH;
191 break;
192
193 case '"':
194 {
195 size_t tpos = pos + (data_len - beg_len) + (beg_len - len);
196
197 vstr_del(s1, tpos, 1);
198
199 --data_len; /* update lengths and re-init iter */
200 --beg_len;
201 len -= 2; /* for above */
202
203 if (!vstr_iter_fwd_beg(s1, tpos + 1, len, iter)) abort();
204 state = VCSV_ST_GET_BEG_DQUOT;
205 continue;
206 }
207 break;
208 }
209 --len; /* reverse above */
210 }
211 break;
212
213 case VCSV_ST_GET_NORM:
214 tmp = 0;
215 while (tmp < iter->len)
216 {
217 if ((iter->ptr[tmp] == ',') ||
218 (iter->ptr[tmp] == '\r') ||
219 (iter->ptr[tmp] == '\n'))
220 break;
221
222 ++tmp;
223 }
224 VCSV__INC(tmp);
225 if (!iter->len)
226 break;
227
228 vcsv_end(pos, data_len, beg_len, len, rows);
229 if (iter->ptr[0] == ',')
230 state = VCSV_ST_SKIP_COMMA;
231 else
232 state = VCSV_ST_SKIP_RET;
233 break;
234
235 default:
236 abort();
237 }
238 }
239
240 if ((state != VCSV_ST_SKIP_RET) && (state != VCSV_ST_SKIP_TRASH))
241 vcsv_end(pos, data_len, beg_len, len, rows);
242
243 return (ret_len);
244 }
245
#if USE_VDUMP
# define VDUMP(ret, rows) vdump(ret, rows)
/* Print one parsed row.
 *
 * ret  - value returned by the parser for this row, printed between two
 *        rules of '=' characters
 * rows - sections describing the fields; each is printed on its own line
 *        between '|' characters, taking the bytes from vcsv_data
 *
 * Everything is formatted into `out`, which is then written to file
 * descriptor 1 until it is empty.  Calls abort() if a write fails. */
static void vdump(unsigned int ret, Vstr_sects *rows)
{
  /* The %zu conversions below read a size_t from the argument list, so the
   * width has to be passed as a size_t and not as a plain int constant. */
  const size_t rule_width = 79;
  unsigned int scan = 0;

  vstr_add_fmt(out, out->len,
               "${rep_chr:%c%zu}\n" "%u\n" "${rep_chr:%c%zu}\n",
               '=', rule_width, ret, '=', rule_width);

  while (scan < rows->num)
  {
    size_t pos = 1;
    size_t len = 0;

    ++scan; /* sections are numbered from 1 */

    len = VSTR_SECTS_NUM(rows, scan)->len;
    if (len) /* the position of an empty section is not read; 1 is used */
      pos = VSTR_SECTS_NUM(rows, scan)->pos;

    vstr_add_fmt(out, out->len, "|${vstr:%p%zu%zu%u}|\n",
                 (void *)vcsv_data, pos, len, 0U);
  }

  vstr_add_fmt(out, out->len, "${rep_chr:%c%zu}\n", '-', rule_width);

  while (out->len)
    if (!vstr_sc_write_fd(out, 1, out->len, 1, NULL))
      abort();
}
#else
# define VDUMP(ret, rows) /* nothing */
#endif
280
281 #if defined(_WIN32)
282 #include <Windows.h>
283
284 #define MILLISECONDS_BETWEEN_1970_AND_1601 11644473600000Ui64
285 typedef unsigned __int64 uint64_t;
286
287 uint64_t
timestamp(void)288 timestamp(void)
289 {
290 FILETIME ftime;
291 uint64_t ret;
292
293 GetSystemTimeAsFileTime(&ftime);
294
295 ret = ftime.dwHighDateTime;
296 ret <<= 32Ui64;
297 ret |= ftime.dwLowDateTime;
298 ret = ret / 10000Ui64 - MILLISECONDS_BETWEEN_1970_AND_1601;
299
300 return ret;
301 }
302
303 #else
304 #include <sys/time.h>
305 #include <inttypes.h>
306
307
/* Current time in milliseconds, built from the seconds and microseconds
 * reported by gettimeofday().  The result of gettimeofday() itself is not
 * checked. */
static inline uint64_t
timestamp(void)
{
  struct timeval now;
  long long millis;

  gettimeofday(&now, NULL);

  millis = now.tv_sec * 1000LL; /* whole seconds */
  millis += now.tv_usec / 1000; /* plus the sub-second part */

  return millis;
}
317
318 #endif
319
main(int argc,char * argv[])320 int main(int argc, char *argv[])
321 {
322 VSTR_SECTS_DECL(vrows, 256);
323
324 int ret = 0;
325
326 if (argc != 2) abort();
327
328 if (!vstr_init())
329 abort();
330
331 VSTR_SECTS_DECL_INIT(vrows);
332
333 vcsv_data = vstr_make_base(NULL);
334
335 out = vstr_make_base(NULL);
336 vstr_cntl_conf(NULL, VSTR_CNTL_CONF_SET_FMT_CHAR_ESC, '$');
337 vstr_sc_fmt_add_all(NULL);
338
339 do
340 {
341 size_t pos = 1;
342 size_t len = 0;
343 unsigned int err = 0;
344 uint64_t t0 = timestamp();
345
346 if (!vstr_sc_mmap_file(vcsv_data, 0, argv[1], 0, 0, &err))
347 break;
348
349 len = vcsv_data->len;
350 if (err)
351 abort();
352
353 while ((ret = vcsv_row_parse(vcsv_data, pos, len, vrows, VCSV_OPT_FLAGS)))
354 {
355 VDUMP(ret, vrows);
356
357 pos += ret;
358 len -= ret;
359 vrows->num = 0;
360 }
361
362 vstr_add_fmt(out, out->len, "%'llu milliseconds\n", (timestamp() - t0));
363
364 while (out->len)
365 if (!vstr_sc_write_fd(out, 1, out->len, 2, NULL))
366 abort();
367 } while (FALSE);
368
369 exit (EXIT_SUCCESS);
370 }
371