1 /*
2  * Copyright (c) 2002-2015 Balabit
3  * Copyright (c) 1998-2015 Balázs Scheidler
4  *
5  * This library is free software; you can redistribute it and/or
6  * modify it under the terms of the GNU Lesser General Public
7  * License as published by the Free Software Foundation; either
8  * version 2.1 of the License, or (at your option) any later version.
9  *
10  * This library is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  * Lesser General Public License for more details.
14  *
15  * You should have received a copy of the GNU Lesser General Public
16  * License along with this library; if not, write to the Free Software
17  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
18  *
19  * As an additional exemption you are allowed to compile & link against the
20  * OpenSSL libraries as published by the OpenSSL project. See the file
21  * COPYING for details.
22  *
23  */
24 #include "csv-scanner.h"
25 #include "str-utils.h"
26 #include "string-list.h"
27 #include "scratch-buffers.h"
28 
29 #include <string.h>
30 
31 /************************************************************************
32  * CSVScannerOptions
33  ************************************************************************/
34 
35 void
csv_scanner_options_set_flags(CSVScannerOptions * options,guint32 flags)36 csv_scanner_options_set_flags(CSVScannerOptions *options, guint32 flags)
37 {
38   options->flags = flags;
39 }
40 
41 void
csv_scanner_options_set_dialect(CSVScannerOptions * options,CSVScannerDialect dialect)42 csv_scanner_options_set_dialect(CSVScannerOptions *options, CSVScannerDialect dialect)
43 {
44   options->dialect = dialect;
45 }
46 
47 void
csv_scanner_options_set_columns(CSVScannerOptions * options,GList * columns)48 csv_scanner_options_set_columns(CSVScannerOptions *options, GList *columns)
49 {
50   string_list_free(options->columns);
51   options->columns = columns;
52 }
53 
54 void
csv_scanner_options_set_delimiters(CSVScannerOptions * options,const gchar * delimiters)55 csv_scanner_options_set_delimiters(CSVScannerOptions *options, const gchar *delimiters)
56 {
57   g_free(options->delimiters);
58   options->delimiters = g_strdup(delimiters);
59 }
60 
61 void
csv_scanner_options_set_string_delimiters(CSVScannerOptions * options,GList * string_delimiters)62 csv_scanner_options_set_string_delimiters(CSVScannerOptions *options, GList *string_delimiters)
63 {
64   string_list_free(options->string_delimiters);
65   options->string_delimiters = string_delimiters;
66 }
67 
68 void
csv_scanner_options_set_quotes_start_and_end(CSVScannerOptions * options,const gchar * quotes_start,const gchar * quotes_end)69 csv_scanner_options_set_quotes_start_and_end(CSVScannerOptions *options, const gchar *quotes_start,
70                                              const gchar *quotes_end)
71 {
72   g_free(options->quotes_start);
73   g_free(options->quotes_end);
74   options->quotes_start = g_strdup(quotes_start);
75   options->quotes_end = g_strdup(quotes_end);
76 }
77 
78 void
csv_scanner_options_set_quotes(CSVScannerOptions * options,const gchar * quotes)79 csv_scanner_options_set_quotes(CSVScannerOptions *options, const gchar *quotes)
80 {
81   csv_scanner_options_set_quotes_start_and_end(options, quotes, quotes);
82 }
83 
84 void
csv_scanner_options_set_quote_pairs(CSVScannerOptions * options,const gchar * quote_pairs)85 csv_scanner_options_set_quote_pairs(CSVScannerOptions *options, const gchar *quote_pairs)
86 {
87   gint i;
88 
89   g_free(options->quotes_start);
90   g_free(options->quotes_end);
91 
92   options->quotes_start = g_malloc((strlen(quote_pairs) / 2) + 1);
93   options->quotes_end = g_malloc((strlen(quote_pairs) / 2) + 1);
94 
95   for (i = 0; quote_pairs[i] && quote_pairs[i+1]; i += 2)
96     {
97       options->quotes_start[i / 2] = quote_pairs[i];
98       options->quotes_end[i / 2] = quote_pairs[i + 1];
99     }
100   options->quotes_start[i / 2] = 0;
101   options->quotes_end[i / 2] = 0;
102 }
103 
104 
105 void
csv_scanner_options_set_null_value(CSVScannerOptions * options,const gchar * null_value)106 csv_scanner_options_set_null_value(CSVScannerOptions *options, const gchar *null_value)
107 {
108   g_free(options->null_value);
109   options->null_value = null_value && null_value[0] ? g_strdup(null_value) : NULL;
110 }
111 
112 void
csv_scanner_options_copy(CSVScannerOptions * dst,CSVScannerOptions * src)113 csv_scanner_options_copy(CSVScannerOptions *dst, CSVScannerOptions *src)
114 {
115   csv_scanner_options_set_delimiters(dst, src->delimiters);
116   csv_scanner_options_set_quotes_start_and_end(dst, src->quotes_start, src->quotes_end);
117   csv_scanner_options_set_null_value(dst, src->null_value);
118   csv_scanner_options_set_string_delimiters(dst, string_list_clone(src->string_delimiters));
119   csv_scanner_options_set_columns(dst, string_list_clone(src->columns));
120   dst->dialect = src->dialect;
121   dst->flags = src->flags;
122 }
123 
124 void
csv_scanner_options_clean(CSVScannerOptions * options)125 csv_scanner_options_clean(CSVScannerOptions *options)
126 {
127   g_free(options->quotes_start);
128   g_free(options->quotes_end);
129   g_free(options->null_value);
130   g_free(options->delimiters);
131   string_list_free(options->string_delimiters);
132   string_list_free(options->columns);
133 }
134 
135 /************************************************************************
136  * CSVScanner
137  ************************************************************************/
138 
139 static gboolean
_is_whitespace_char(const gchar * str)140 _is_whitespace_char(const gchar *str)
141 {
142   return (*str == ' ' || *str == '\t');
143 }
144 
145 static void
_skip_whitespace(const gchar ** src)146 _skip_whitespace(const gchar **src)
147 {
148   while (_is_whitespace_char(*src))
149     (*src)++;
150 }
151 
152 static void
_parse_opening_quote_character(CSVScanner * self)153 _parse_opening_quote_character(CSVScanner *self)
154 {
155   gchar *quote = _strchr_optimized_for_single_char_haystack(self->options->quotes_start, *self->src);
156 
157   if (quote != NULL)
158     {
159       /* ok, quote character found */
160       self->src++;
161       self->current_quote = self->options->quotes_end[quote - self->options->quotes_start];
162     }
163   else
164     {
165       /* we didn't start with a quote character, no need for escaping, delimiter terminates */
166       self->current_quote = 0;
167     }
168 }
169 
170 static void
_parse_left_whitespace(CSVScanner * self)171 _parse_left_whitespace(CSVScanner *self)
172 {
173   if ((self->options->flags & CSV_SCANNER_STRIP_WHITESPACE) == 0)
174     return;
175 
176   _skip_whitespace(&self->src);
177 }
178 
179 static void
_parse_character_with_quotation(CSVScanner * self)180 _parse_character_with_quotation(CSVScanner *self)
181 {
182   /* quoted character */
183   if (self->options->dialect == CSV_SCANNER_ESCAPE_BACKSLASH &&
184       *self->src == '\\' &&
185       *(self->src + 1))
186     {
187       self->src++;
188     }
189   else if (self->options->dialect == CSV_SCANNER_ESCAPE_DOUBLE_CHAR &&
190            *self->src == self->current_quote &&
191            *(self->src+1) == self->current_quote)
192     {
193       self->src++;
194     }
195   else if (*self->src == self->current_quote)
196     {
197       self->current_quote = 0;
198       self->src++;
199       return;
200     }
201   g_string_append_c(self->current_value, *self->src);
202   self->src++;
203 }
204 
205 /* searches for str in list and returns the first occurrence, otherwise NULL */
206 static gboolean
_match_string_delimiters_at_current_position(const char * input,GList * string_delimiters,int * result_length)207 _match_string_delimiters_at_current_position(const char *input, GList *string_delimiters, int *result_length)
208 {
209   GList *l;
210 
211   for (l = string_delimiters; l; l = l->next)
212     {
213       gint len = strlen(l->data);
214 
215       if (strncmp(input, l->data, len) == 0)
216         {
217           *result_length = len;
218           return TRUE;
219         }
220     }
221   return FALSE;
222 }
223 
224 static gboolean
_parse_string_delimiters_at_current_position(CSVScanner * self)225 _parse_string_delimiters_at_current_position(CSVScanner *self)
226 {
227   gint delim_len;
228 
229   if (!self->options->string_delimiters)
230     return FALSE;
231 
232   if (_match_string_delimiters_at_current_position(self->src,
233                                                    self->options->string_delimiters,
234                                                    &delim_len))
235     {
236       self->src += delim_len;
237       return TRUE;
238     }
239   return FALSE;
240 }
241 
242 static gboolean
_parse_character_delimiters_at_current_position(CSVScanner * self)243 _parse_character_delimiters_at_current_position(CSVScanner *self)
244 {
245   if (_strchr_optimized_for_single_char_haystack(self->options->delimiters, *self->src) != NULL)
246     {
247       self->src++;
248       return TRUE;
249     }
250   return FALSE;
251 }
252 
253 static gboolean
_parse_delimiter(CSVScanner * self)254 _parse_delimiter(CSVScanner *self)
255 {
256   if (_parse_string_delimiters_at_current_position(self))
257     return TRUE;
258 
259   if (_parse_character_delimiters_at_current_position(self))
260     return TRUE;
261 
262   return FALSE;
263 }
264 
265 static void
_parse_unquoted_literal_character(CSVScanner * self)266 _parse_unquoted_literal_character(CSVScanner *self)
267 {
268   g_string_append_c(self->current_value, *self->src);
269   self->src++;
270 }
271 
272 static void
_parse_value_with_whitespace_and_delimiter(CSVScanner * self)273 _parse_value_with_whitespace_and_delimiter(CSVScanner *self)
274 {
275   while (*self->src)
276     {
277       if (self->current_quote)
278         {
279           /* within quotation marks */
280           _parse_character_with_quotation(self);
281         }
282       else
283         {
284           /* unquoted value */
285           if (_parse_delimiter(self))
286             break;
287           _parse_unquoted_literal_character(self);
288         }
289     }
290 }
291 
292 static gint
_get_value_length_without_right_whitespace(CSVScanner * self)293 _get_value_length_without_right_whitespace(CSVScanner *self)
294 {
295   gint len = self->current_value->len;
296 
297   while (len > 0 && _is_whitespace_char(self->current_value->str + len - 1))
298     len--;
299 
300   return len;
301 }
302 
303 static void
_translate_rstrip_whitespace(CSVScanner * self)304 _translate_rstrip_whitespace(CSVScanner *self)
305 {
306   if (self->options->flags & CSV_SCANNER_STRIP_WHITESPACE)
307     g_string_truncate(self->current_value, _get_value_length_without_right_whitespace(self));
308 }
309 
310 static void
_translate_null_value(CSVScanner * self)311 _translate_null_value(CSVScanner *self)
312 {
313   if (self->options->null_value &&
314       strcmp(self->current_value->str, self->options->null_value) == 0)
315     g_string_truncate(self->current_value, 0);
316 }
317 
318 static void
_translate_value(CSVScanner * self)319 _translate_value(CSVScanner *self)
320 {
321   _translate_rstrip_whitespace(self);
322   _translate_null_value(self);
323 }
324 
325 static gboolean
_is_last_column(CSVScanner * self)326 _is_last_column(CSVScanner *self)
327 {
328   return self->current_column && self->current_column->next == NULL;
329 }
330 
331 static gboolean
_switch_to_next_column(CSVScanner * self)332 _switch_to_next_column(CSVScanner *self)
333 {
334   g_string_truncate(self->current_value, 0);
335 
336   switch (self->state)
337     {
338     case CSV_STATE_INITIAL:
339       self->state = CSV_STATE_COLUMNS;
340       self->current_column = self->options->columns;
341       if (self->current_column)
342         return TRUE;
343       self->state = CSV_STATE_FINISH;
344       return FALSE;
345     case CSV_STATE_COLUMNS:
346     case CSV_STATE_GREEDY_COLUMN:
347       self->current_column = self->current_column->next;
348       if (self->current_column)
349         return TRUE;
350       self->state = CSV_STATE_FINISH;
351       return FALSE;
352     case CSV_STATE_PARTIAL_INPUT:
353     case CSV_STATE_FINISH:
354       return FALSE;
355     default:
356       break;
357     }
358   g_assert_not_reached();
359 }
360 
361 gboolean
csv_scanner_scan_next(CSVScanner * self)362 csv_scanner_scan_next(CSVScanner *self)
363 {
364   if (!_switch_to_next_column(self))
365     return FALSE;
366 
367   if (_is_last_column(self) && (self->options->flags & CSV_SCANNER_GREEDY))
368     {
369       g_string_assign(self->current_value, self->src);
370       self->src += self->current_value->len;
371       self->state = CSV_STATE_GREEDY_COLUMN;
372       return TRUE;
373     }
374   else if (self->src[0] == 0)
375     {
376       /* no more input data and a real column, not a greedy one */
377       self->state = CSV_STATE_PARTIAL_INPUT;
378       return FALSE;
379     }
380   else
381     {
382       _parse_opening_quote_character(self);
383       _parse_left_whitespace(self);
384       _parse_value_with_whitespace_and_delimiter(self);
385       _translate_value(self);
386       return TRUE;
387     }
388 }
389 
390 const gchar *
csv_scanner_get_current_name(CSVScanner * self)391 csv_scanner_get_current_name(CSVScanner *self)
392 {
393   if (self->current_column)
394     return (const gchar *) self->current_column->data;
395   else if (self->state == CSV_STATE_INITIAL && self->options->columns)
396     return self->options->columns->data;
397   else
398     return NULL;
399 }
400 
401 gboolean
csv_scanner_is_scan_complete(CSVScanner * self)402 csv_scanner_is_scan_complete(CSVScanner *self)
403 {
404   /* we didn't process all of the input */
405   if (self->src[0] != 0)
406     return FALSE;
407 
408   return self->state == CSV_STATE_FINISH;
409 }
410 
411 void
csv_scanner_init(CSVScanner * scanner,CSVScannerOptions * options,const gchar * input)412 csv_scanner_init(CSVScanner *scanner, CSVScannerOptions *options, const gchar *input)
413 {
414   memset(scanner, 0, sizeof(*scanner));
415   scanner->state = CSV_STATE_INITIAL;
416   scanner->src = input;
417   scanner->current_value = scratch_buffers_alloc();
418   scanner->current_column = NULL;
419   scanner->options = options;
420 }
421 
422 void
csv_scanner_deinit(CSVScanner * self)423 csv_scanner_deinit(CSVScanner *self)
424 {
425 }
426 
427 const gchar *
csv_scanner_get_current_value(CSVScanner * self)428 csv_scanner_get_current_value(CSVScanner *self)
429 {
430   return self->current_value->str;
431 }
432 
433 gint
csv_scanner_get_current_value_len(CSVScanner * self)434 csv_scanner_get_current_value_len(CSVScanner *self)
435 {
436   return self->current_value->len;
437 }
438 
439 gchar *
csv_scanner_dup_current_value(CSVScanner * self)440 csv_scanner_dup_current_value(CSVScanner *self)
441 {
442   return g_strndup(csv_scanner_get_current_value(self),
443                    csv_scanner_get_current_value_len(self));
444 }
445