1 /*
2 * Copyright (c) 2002-2015 Balabit
3 * Copyright (c) 1998-2015 Balázs Scheidler
4 *
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public
7 * License as published by the Free Software Foundation; either
8 * version 2.1 of the License, or (at your option) any later version.
9 *
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
14 *
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this library; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18 *
19 * As an additional exemption you are allowed to compile & link against the
20 * OpenSSL libraries as published by the OpenSSL project. See the file
21 * COPYING for details.
22 *
23 */
24 #include "csv-scanner.h"
25 #include "str-utils.h"
26 #include "string-list.h"
27 #include "scratch-buffers.h"
28
29 #include <string.h>
30
31 /************************************************************************
32 * CSVScannerOptions
33 ************************************************************************/
34
35 void
csv_scanner_options_set_flags(CSVScannerOptions * options,guint32 flags)36 csv_scanner_options_set_flags(CSVScannerOptions *options, guint32 flags)
37 {
38 options->flags = flags;
39 }
40
41 void
csv_scanner_options_set_dialect(CSVScannerOptions * options,CSVScannerDialect dialect)42 csv_scanner_options_set_dialect(CSVScannerOptions *options, CSVScannerDialect dialect)
43 {
44 options->dialect = dialect;
45 }
46
47 void
csv_scanner_options_set_columns(CSVScannerOptions * options,GList * columns)48 csv_scanner_options_set_columns(CSVScannerOptions *options, GList *columns)
49 {
50 string_list_free(options->columns);
51 options->columns = columns;
52 }
53
54 void
csv_scanner_options_set_delimiters(CSVScannerOptions * options,const gchar * delimiters)55 csv_scanner_options_set_delimiters(CSVScannerOptions *options, const gchar *delimiters)
56 {
57 g_free(options->delimiters);
58 options->delimiters = g_strdup(delimiters);
59 }
60
61 void
csv_scanner_options_set_string_delimiters(CSVScannerOptions * options,GList * string_delimiters)62 csv_scanner_options_set_string_delimiters(CSVScannerOptions *options, GList *string_delimiters)
63 {
64 string_list_free(options->string_delimiters);
65 options->string_delimiters = string_delimiters;
66 }
67
68 void
csv_scanner_options_set_quotes_start_and_end(CSVScannerOptions * options,const gchar * quotes_start,const gchar * quotes_end)69 csv_scanner_options_set_quotes_start_and_end(CSVScannerOptions *options, const gchar *quotes_start,
70 const gchar *quotes_end)
71 {
72 g_free(options->quotes_start);
73 g_free(options->quotes_end);
74 options->quotes_start = g_strdup(quotes_start);
75 options->quotes_end = g_strdup(quotes_end);
76 }
77
78 void
csv_scanner_options_set_quotes(CSVScannerOptions * options,const gchar * quotes)79 csv_scanner_options_set_quotes(CSVScannerOptions *options, const gchar *quotes)
80 {
81 csv_scanner_options_set_quotes_start_and_end(options, quotes, quotes);
82 }
83
84 void
csv_scanner_options_set_quote_pairs(CSVScannerOptions * options,const gchar * quote_pairs)85 csv_scanner_options_set_quote_pairs(CSVScannerOptions *options, const gchar *quote_pairs)
86 {
87 gint i;
88
89 g_free(options->quotes_start);
90 g_free(options->quotes_end);
91
92 options->quotes_start = g_malloc((strlen(quote_pairs) / 2) + 1);
93 options->quotes_end = g_malloc((strlen(quote_pairs) / 2) + 1);
94
95 for (i = 0; quote_pairs[i] && quote_pairs[i+1]; i += 2)
96 {
97 options->quotes_start[i / 2] = quote_pairs[i];
98 options->quotes_end[i / 2] = quote_pairs[i + 1];
99 }
100 options->quotes_start[i / 2] = 0;
101 options->quotes_end[i / 2] = 0;
102 }
103
104
105 void
csv_scanner_options_set_null_value(CSVScannerOptions * options,const gchar * null_value)106 csv_scanner_options_set_null_value(CSVScannerOptions *options, const gchar *null_value)
107 {
108 g_free(options->null_value);
109 options->null_value = null_value && null_value[0] ? g_strdup(null_value) : NULL;
110 }
111
112 void
csv_scanner_options_copy(CSVScannerOptions * dst,CSVScannerOptions * src)113 csv_scanner_options_copy(CSVScannerOptions *dst, CSVScannerOptions *src)
114 {
115 csv_scanner_options_set_delimiters(dst, src->delimiters);
116 csv_scanner_options_set_quotes_start_and_end(dst, src->quotes_start, src->quotes_end);
117 csv_scanner_options_set_null_value(dst, src->null_value);
118 csv_scanner_options_set_string_delimiters(dst, string_list_clone(src->string_delimiters));
119 csv_scanner_options_set_columns(dst, string_list_clone(src->columns));
120 dst->dialect = src->dialect;
121 dst->flags = src->flags;
122 }
123
124 void
csv_scanner_options_clean(CSVScannerOptions * options)125 csv_scanner_options_clean(CSVScannerOptions *options)
126 {
127 g_free(options->quotes_start);
128 g_free(options->quotes_end);
129 g_free(options->null_value);
130 g_free(options->delimiters);
131 string_list_free(options->string_delimiters);
132 string_list_free(options->columns);
133 }
134
135 /************************************************************************
136 * CSVScanner
137 ************************************************************************/
138
139 static gboolean
_is_whitespace_char(const gchar * str)140 _is_whitespace_char(const gchar *str)
141 {
142 return (*str == ' ' || *str == '\t');
143 }
144
145 static void
_skip_whitespace(const gchar ** src)146 _skip_whitespace(const gchar **src)
147 {
148 while (_is_whitespace_char(*src))
149 (*src)++;
150 }
151
152 static void
_parse_opening_quote_character(CSVScanner * self)153 _parse_opening_quote_character(CSVScanner *self)
154 {
155 gchar *quote = _strchr_optimized_for_single_char_haystack(self->options->quotes_start, *self->src);
156
157 if (quote != NULL)
158 {
159 /* ok, quote character found */
160 self->src++;
161 self->current_quote = self->options->quotes_end[quote - self->options->quotes_start];
162 }
163 else
164 {
165 /* we didn't start with a quote character, no need for escaping, delimiter terminates */
166 self->current_quote = 0;
167 }
168 }
169
170 static void
_parse_left_whitespace(CSVScanner * self)171 _parse_left_whitespace(CSVScanner *self)
172 {
173 if ((self->options->flags & CSV_SCANNER_STRIP_WHITESPACE) == 0)
174 return;
175
176 _skip_whitespace(&self->src);
177 }
178
179 static void
_parse_character_with_quotation(CSVScanner * self)180 _parse_character_with_quotation(CSVScanner *self)
181 {
182 /* quoted character */
183 if (self->options->dialect == CSV_SCANNER_ESCAPE_BACKSLASH &&
184 *self->src == '\\' &&
185 *(self->src + 1))
186 {
187 self->src++;
188 }
189 else if (self->options->dialect == CSV_SCANNER_ESCAPE_DOUBLE_CHAR &&
190 *self->src == self->current_quote &&
191 *(self->src+1) == self->current_quote)
192 {
193 self->src++;
194 }
195 else if (*self->src == self->current_quote)
196 {
197 self->current_quote = 0;
198 self->src++;
199 return;
200 }
201 g_string_append_c(self->current_value, *self->src);
202 self->src++;
203 }
204
205 /* searches for str in list and returns the first occurrence, otherwise NULL */
206 static gboolean
_match_string_delimiters_at_current_position(const char * input,GList * string_delimiters,int * result_length)207 _match_string_delimiters_at_current_position(const char *input, GList *string_delimiters, int *result_length)
208 {
209 GList *l;
210
211 for (l = string_delimiters; l; l = l->next)
212 {
213 gint len = strlen(l->data);
214
215 if (strncmp(input, l->data, len) == 0)
216 {
217 *result_length = len;
218 return TRUE;
219 }
220 }
221 return FALSE;
222 }
223
224 static gboolean
_parse_string_delimiters_at_current_position(CSVScanner * self)225 _parse_string_delimiters_at_current_position(CSVScanner *self)
226 {
227 gint delim_len;
228
229 if (!self->options->string_delimiters)
230 return FALSE;
231
232 if (_match_string_delimiters_at_current_position(self->src,
233 self->options->string_delimiters,
234 &delim_len))
235 {
236 self->src += delim_len;
237 return TRUE;
238 }
239 return FALSE;
240 }
241
242 static gboolean
_parse_character_delimiters_at_current_position(CSVScanner * self)243 _parse_character_delimiters_at_current_position(CSVScanner *self)
244 {
245 if (_strchr_optimized_for_single_char_haystack(self->options->delimiters, *self->src) != NULL)
246 {
247 self->src++;
248 return TRUE;
249 }
250 return FALSE;
251 }
252
253 static gboolean
_parse_delimiter(CSVScanner * self)254 _parse_delimiter(CSVScanner *self)
255 {
256 if (_parse_string_delimiters_at_current_position(self))
257 return TRUE;
258
259 if (_parse_character_delimiters_at_current_position(self))
260 return TRUE;
261
262 return FALSE;
263 }
264
265 static void
_parse_unquoted_literal_character(CSVScanner * self)266 _parse_unquoted_literal_character(CSVScanner *self)
267 {
268 g_string_append_c(self->current_value, *self->src);
269 self->src++;
270 }
271
272 static void
_parse_value_with_whitespace_and_delimiter(CSVScanner * self)273 _parse_value_with_whitespace_and_delimiter(CSVScanner *self)
274 {
275 while (*self->src)
276 {
277 if (self->current_quote)
278 {
279 /* within quotation marks */
280 _parse_character_with_quotation(self);
281 }
282 else
283 {
284 /* unquoted value */
285 if (_parse_delimiter(self))
286 break;
287 _parse_unquoted_literal_character(self);
288 }
289 }
290 }
291
292 static gint
_get_value_length_without_right_whitespace(CSVScanner * self)293 _get_value_length_without_right_whitespace(CSVScanner *self)
294 {
295 gint len = self->current_value->len;
296
297 while (len > 0 && _is_whitespace_char(self->current_value->str + len - 1))
298 len--;
299
300 return len;
301 }
302
303 static void
_translate_rstrip_whitespace(CSVScanner * self)304 _translate_rstrip_whitespace(CSVScanner *self)
305 {
306 if (self->options->flags & CSV_SCANNER_STRIP_WHITESPACE)
307 g_string_truncate(self->current_value, _get_value_length_without_right_whitespace(self));
308 }
309
310 static void
_translate_null_value(CSVScanner * self)311 _translate_null_value(CSVScanner *self)
312 {
313 if (self->options->null_value &&
314 strcmp(self->current_value->str, self->options->null_value) == 0)
315 g_string_truncate(self->current_value, 0);
316 }
317
318 static void
_translate_value(CSVScanner * self)319 _translate_value(CSVScanner *self)
320 {
321 _translate_rstrip_whitespace(self);
322 _translate_null_value(self);
323 }
324
325 static gboolean
_is_last_column(CSVScanner * self)326 _is_last_column(CSVScanner *self)
327 {
328 return self->current_column && self->current_column->next == NULL;
329 }
330
331 static gboolean
_switch_to_next_column(CSVScanner * self)332 _switch_to_next_column(CSVScanner *self)
333 {
334 g_string_truncate(self->current_value, 0);
335
336 switch (self->state)
337 {
338 case CSV_STATE_INITIAL:
339 self->state = CSV_STATE_COLUMNS;
340 self->current_column = self->options->columns;
341 if (self->current_column)
342 return TRUE;
343 self->state = CSV_STATE_FINISH;
344 return FALSE;
345 case CSV_STATE_COLUMNS:
346 case CSV_STATE_GREEDY_COLUMN:
347 self->current_column = self->current_column->next;
348 if (self->current_column)
349 return TRUE;
350 self->state = CSV_STATE_FINISH;
351 return FALSE;
352 case CSV_STATE_PARTIAL_INPUT:
353 case CSV_STATE_FINISH:
354 return FALSE;
355 default:
356 break;
357 }
358 g_assert_not_reached();
359 }
360
361 gboolean
csv_scanner_scan_next(CSVScanner * self)362 csv_scanner_scan_next(CSVScanner *self)
363 {
364 if (!_switch_to_next_column(self))
365 return FALSE;
366
367 if (_is_last_column(self) && (self->options->flags & CSV_SCANNER_GREEDY))
368 {
369 g_string_assign(self->current_value, self->src);
370 self->src += self->current_value->len;
371 self->state = CSV_STATE_GREEDY_COLUMN;
372 return TRUE;
373 }
374 else if (self->src[0] == 0)
375 {
376 /* no more input data and a real column, not a greedy one */
377 self->state = CSV_STATE_PARTIAL_INPUT;
378 return FALSE;
379 }
380 else
381 {
382 _parse_opening_quote_character(self);
383 _parse_left_whitespace(self);
384 _parse_value_with_whitespace_and_delimiter(self);
385 _translate_value(self);
386 return TRUE;
387 }
388 }
389
390 const gchar *
csv_scanner_get_current_name(CSVScanner * self)391 csv_scanner_get_current_name(CSVScanner *self)
392 {
393 if (self->current_column)
394 return (const gchar *) self->current_column->data;
395 else if (self->state == CSV_STATE_INITIAL && self->options->columns)
396 return self->options->columns->data;
397 else
398 return NULL;
399 }
400
401 gboolean
csv_scanner_is_scan_complete(CSVScanner * self)402 csv_scanner_is_scan_complete(CSVScanner *self)
403 {
404 /* we didn't process all of the input */
405 if (self->src[0] != 0)
406 return FALSE;
407
408 return self->state == CSV_STATE_FINISH;
409 }
410
411 void
csv_scanner_init(CSVScanner * scanner,CSVScannerOptions * options,const gchar * input)412 csv_scanner_init(CSVScanner *scanner, CSVScannerOptions *options, const gchar *input)
413 {
414 memset(scanner, 0, sizeof(*scanner));
415 scanner->state = CSV_STATE_INITIAL;
416 scanner->src = input;
417 scanner->current_value = scratch_buffers_alloc();
418 scanner->current_column = NULL;
419 scanner->options = options;
420 }
421
422 void
csv_scanner_deinit(CSVScanner * self)423 csv_scanner_deinit(CSVScanner *self)
424 {
425 }
426
427 const gchar *
csv_scanner_get_current_value(CSVScanner * self)428 csv_scanner_get_current_value(CSVScanner *self)
429 {
430 return self->current_value->str;
431 }
432
433 gint
csv_scanner_get_current_value_len(CSVScanner * self)434 csv_scanner_get_current_value_len(CSVScanner *self)
435 {
436 return self->current_value->len;
437 }
438
439 gchar *
csv_scanner_dup_current_value(CSVScanner * self)440 csv_scanner_dup_current_value(CSVScanner *self)
441 {
442 return g_strndup(csv_scanner_get_current_value(self),
443 csv_scanner_get_current_value_len(self));
444 }
445