1 /*-
2 * See the file LICENSE for redistribution information.
3 *
4 * Copyright (c) 2005, 2013 Oracle and/or its affiliates. All rights reserved.
5 *
6 * $Id$
7 */
8
9 #include "csv.h"
10 #include "csv_local.h"
11 #include "csv_extern.h"
12
13 typedef enum { GL_OK, GL_EOF, GL_FAIL } getline_status;
14
15 static int input_field_count(const char *, size_t, u_int32_t *);
16 static getline_status
17 input_getline(char **, size_t *, size_t *);
18 static int input_put_alloc(u_int32_t **, size_t *, size_t, u_int32_t);
19 static int input_set_offset(u_int32_t *, char *, size_t, u_int32_t);
20
21 static input_fmt ifmt; /* Input format. */
22 static u_long record_count = 0; /* Input record count for errors. */
23 static u_long version; /* Version we're loading. */
24
25 /*
26 * input_load --
27 * Read the input file and load new records into the database.
28 */
29 int
input_load(input_fmt ifmt_arg,u_long version_arg)30 input_load(input_fmt ifmt_arg, u_long version_arg)
31 {
32 getline_status gtl_status;
33 DBT key, data;
34 DBC *cursor;
35 u_int32_t field_count, primary_key, *put_line;
36 size_t input_len, len, put_len;
37 int is_first, ret;
38 char *input_line;
39
40 field_count = 0; /* Shut the compiler up. */
41
42 /* ifmt and version are global to this file. */
43 ifmt = ifmt_arg;
44 version = version_arg;
45
46 /*
47 * The primary key for the database is a unique number. Find out the
48 * last unique number allocated in this database by opening a cursor
49 * and fetching the last record.
50 */
51 if ((ret = db->cursor(db, NULL, &cursor, 0)) != 0) {
52 dbenv->err(dbenv, ret, "DB->cursor");
53 return (1);
54 }
55 memset(&key, 0, sizeof(key));
56 memset(&data, 0, sizeof(data));
57 if ((ret = cursor->c_get(cursor, &key, &data, DB_LAST)) != 0)
58 if (ret == DB_NOTFOUND)
59 primary_key = 0;
60 else {
61 dbenv->err(dbenv, ret, "DB->cursor: DB_LAST");
62 return (1);
63 }
64 else
65 memcpy(&primary_key, key.data, sizeof(primary_key));
66 if ((ret = cursor->c_close(cursor)) != 0) {
67 dbenv->err(dbenv, ret, "DBC->close");
68 return (1);
69 }
70 if (verbose)
71 dbenv->errx(dbenv,
72 "maximum existing record in the database is %lu",
73 (u_long)primary_key);
74
75 key.data = &primary_key;
76 key.size = sizeof(primary_key);
77 input_line = NULL;
78 put_line = NULL;
79 input_len = put_len = 0;
80
81 /*
82 * See the README file for a description of the file input format.
83 */
84 for (is_first = 1; (gtl_status =
85 input_getline(&input_line, &input_len, &len)) == GL_OK;) {
86 ++record_count;
87 if (verbose > 1)
88 dbenv->errx(dbenv, "reading %lu", (u_long)record_count);
89
90 /* The first non-blank line of the input is a column map. */
91 if (is_first) {
92 is_first = 0;
93
94 /* Count the fields we're expecting in the input. */
95 if (input_field_count(
96 input_line, len, &field_count) != 0)
97 return (1);
98
99 }
100
101 /* Allocate room for the table of offsets. */
102 if (input_put_alloc(
103 &put_line, &put_len, len, field_count) != 0)
104 return (1);
105
106 /*
107 * Build the offset table and create the record we're
108 * going to store.
109 */
110 if (input_set_offset(put_line,
111 input_line, len, field_count) != 0)
112 return (1);
113
114 ++primary_key;
115
116 memcpy(put_line + (field_count + 2), input_line, len);
117 data.data = put_line;
118 data.size = (u_int32_t)
119 ((field_count + 2) * sizeof(u_int32_t) + len);
120
121 if (verbose > 1)
122 (void)entry_print(
123 data.data, data.size, field_count);
124
125 /* Load the key/data pair into the database. */
126 if ((ret = db->put(db, NULL, &key, &data, 0)) != 0) {
127 dbenv->err(dbenv, ret,
128 "DB->put: %lu", (u_long)primary_key);
129 return (1);
130 }
131 }
132
133 if (gtl_status != GL_EOF)
134 return (1);
135
136 if (verbose)
137 dbenv->errx(dbenv,
138 "%lu records read from the input file into the database",
139 record_count);
140
141 /*
142 * This program isn't transactional, limit the window for corruption.
143 */
144 if ((ret = db->sync(db, 0)) != 0) {
145 dbenv->err(dbenv, ret, "DB->sync");
146 return (1);
147 }
148
149 return (0);
150 }
151
152 /*
153 * input_getline --
154 * Read in a line of input into a buffer.
155 */
156 static getline_status
input_getline(char ** input_linep,size_t * input_lenp,size_t * lenp)157 input_getline(char **input_linep, size_t *input_lenp, size_t *lenp)
158 {
159 size_t input_len, len;
160 int ch;
161 char *input_line, *p, *endp;
162
163 input_line = *input_linep;
164 input_len = *input_lenp;
165
166 p = input_line;
167 endp = input_line + input_len;
168
169 for (len = 0; (ch = getchar()) != EOF;) {
170 if (ch == '\0') /* Strip <nul> (\000) bytes. */
171 continue;
172 switch (ifmt) {
173 case FORMAT_NL:
174 if (ch == '\n')
175 goto end;
176 break;
177 case FORMAT_EXCEL:
178 /* Strip <nl> (\012) bytes. */
179 if (ch == '\n')
180 continue;
181 /*
182 * <cr> (\015) bytes terminate lines.
183 * Skip blank lines.
184 */
185 if (ch == '\015') {
186 if (len == 0)
187 continue;
188 goto end;
189 }
190 }
191 if (input_line == endp) {
192 input_len += 256;
193 input_len *= 2;
194 if ((input_line =
195 realloc(input_line, input_len)) == NULL) {
196 dbenv->err(dbenv, errno,
197 "unable to allocate %lu bytes for record",
198 (u_long)input_len);
199 return (GL_FAIL);
200 }
201 p = input_line;
202 endp = p + input_len;
203 }
204
205 if (isprint(ch)) { /* Strip unprintables. */
206 *p++ = (char)ch;
207 ++len;
208 }
209 }
210
211 end: if (len == 0)
212 return (GL_EOF);
213
214 *lenp = len;
215 *input_linep = input_line;
216 *input_lenp = input_len;
217
218 return (GL_OK);
219 }
220
221 /*
222 * input_field_count --
223 * Count the fields in the line.
224 */
225 static int
input_field_count(const char * line,size_t len,u_int32_t * field_countp)226 input_field_count(const char *line, size_t len, u_int32_t *field_countp)
227 {
228 u_int32_t field_count;
229 int quoted;
230
231 field_count = 1;
232
233 /*
234 * There are N-1 separators for N fields, that is, "a,b,c" is three
235 * fields, with two comma separators.
236 */
237 switch (ifmt) {
238 case FORMAT_EXCEL:
239 quoted = 0;
240 for (field_count = 1; len > 0; ++line, --len)
241 if (*line == '"')
242 quoted = !quoted;
243 else if (*line == ',' && !quoted)
244 ++field_count;
245 break;
246 case FORMAT_NL:
247 for (field_count = 1; len > 0; ++line, --len)
248 if (*line == ',')
249 ++field_count;
250 break;
251 }
252 *field_countp = field_count;
253
254 if (verbose)
255 dbenv->errx(dbenv,
256 "input file made up of %lu fields", (u_int)field_count);
257
258 return (0);
259 }
260
261 /*
262 * input_put_alloc --
263 * Allocate room for the offset table plus the input.
264 */
265 static int
input_put_alloc(u_int32_t ** put_linep,size_t * put_lenp,size_t len,u_int32_t field_count)266 input_put_alloc(u_int32_t **put_linep,
267 size_t *put_lenp, size_t len, u_int32_t field_count)
268 {
269 size_t total;
270
271 total = (field_count + 2) * sizeof(u_int32_t) + len;
272 if (total > *put_lenp &&
273 (*put_linep = realloc(*put_linep, *put_lenp += total)) == NULL) {
274 dbenv->err(dbenv, errno,
275 "unable to allocate %lu bytes for record",
276 (u_long)*put_lenp);
277 return (1);
278 }
279 return (0);
280 }
281
282 /*
283 * input_set_offset --
284 * Build an offset table and record combination.
285 */
286 static int
input_set_offset(u_int32_t * put_line,char * input_line,size_t len,u_int32_t field_count)287 input_set_offset(u_int32_t *put_line,
288 char *input_line, size_t len, u_int32_t field_count)
289 {
290 u_int32_t *op;
291 int quoted;
292 char *p, *endp;
293
294 op = put_line;
295
296 /* The first field is the version number. */
297 *op++ = version;
298
299 /*
300 * Walk the input line, looking for comma separators. It's an error
301 * to have too many or too few fields.
302 */
303 *op++ = 0;
304 quoted = 0;
305 for (p = input_line, endp = input_line + len;; ++p) {
306 if (ifmt == FORMAT_EXCEL && p < endp) {
307 if (*p == '"')
308 quoted = !quoted;
309 if (quoted)
310 continue;
311 }
312 if (*p == ',' || p == endp) {
313 if (field_count == 0) {
314 dbenv->errx(dbenv,
315 "record %lu: too many fields in the record",
316 record_count);
317 return (1);
318 }
319 --field_count;
320
321 *op++ = (u_int32_t)(p - input_line) + 1;
322
323 if (verbose > 1)
324 dbenv->errx(dbenv,
325 "offset %lu: {%.*s}", op[-1],
326 OFFSET_LEN(op, -2), input_line + op[-2]);
327
328 /*
329 * Don't insert a new field if the input lines ends
330 * in a comma.
331 */
332 if (p == endp || p + 1 == endp)
333 break;
334 }
335 }
336 *op++ = (u_int32_t)(p - input_line);
337
338 if (field_count != 0) {
339 dbenv->errx(dbenv,
340 "record %lu: not enough fields in the record",
341 record_count);
342 return (1);
343 }
344 memcpy(op, input_line, len);
345
346 return (0);
347 }
348