1 /*-
2  * Copyright (c) 2005, 2020 Oracle and/or its affiliates.  All rights reserved.
3  *
4  * See the file EXAMPLES-LICENSE for license information.
5  *
6  * $Id$
7  */
8 
9 #include "csv.h"
10 #include "csv_local.h"
11 #include "csv_extern.h"
12 
13 typedef enum { GL_OK, GL_EOF, GL_FAIL } getline_status;
14 
15 static int input_field_count(const char *, size_t, u_int32_t *);
16 static getline_status
17 	   input_getline(char **, size_t *, size_t *);
18 static int input_put_alloc(u_int32_t **, size_t *, size_t, u_int32_t);
19 static int input_set_offset(u_int32_t *, char *, size_t, u_int32_t);
20 
21 static input_fmt ifmt;			/* Input format. */
22 static u_long	 record_count = 0;	/* Input record count for errors. */
23 static u_long	 version;		/* Version we're loading. */
24 
25 /*
26  * input_load --
27  *	Read the input file and load new records into the database.
28  */
29 int
input_load(input_fmt ifmt_arg,u_long version_arg)30 input_load(input_fmt ifmt_arg, u_long version_arg)
31 {
32 	getline_status gtl_status;
33 	DBT key, data;
34 	DBC *cursor;
35 	u_int32_t field_count, primary_key, *put_line;
36 	size_t input_len, len, put_len;
37 	int is_first, ret;
38 	char *input_line;
39 
40 	field_count = 0;			/* Shut the compiler up. */
41 
42 	/* ifmt and version are global to this file. */
43 	ifmt = ifmt_arg;
44 	version = version_arg;
45 
46 	/*
47 	 * The primary key for the database is a unique number.  Find out the
48 	 * last unique number allocated in this database by opening a cursor
49 	 * and fetching the last record.
50 	 */
51 	if ((ret = db->cursor(db, NULL, &cursor, 0)) != 0) {
52 		dbenv->err(dbenv, ret, "DB->cursor");
53 		return (1);
54 	}
55 	memset(&key, 0, sizeof(key));
56 	memset(&data, 0, sizeof(data));
57 	if ((ret = cursor->c_get(cursor, &key, &data, DB_LAST)) != 0)
58 		if (ret == DB_NOTFOUND)
59 			primary_key = 0;
60 		else {
61 			dbenv->err(dbenv, ret, "DB->cursor: DB_LAST");
62 			return (1);
63 		}
64 	else
65 		memcpy(&primary_key, key.data, sizeof(primary_key));
66 	if ((ret = cursor->c_close(cursor)) != 0) {
67 		dbenv->err(dbenv, ret, "DBC->close");
68 		return (1);
69 	}
70 	if (verbose)
71 		dbenv->errx(dbenv,
72 		    "maximum existing record in the database is %lu",
73 		    (u_long)primary_key);
74 
75 	key.data = &primary_key;
76 	key.size = sizeof(primary_key);
77 	input_line = NULL;
78 	put_line = NULL;
79 	input_len = put_len = 0;
80 
81 	/*
82 	 * See the README file for a description of the file input format.
83 	 */
84 	for (is_first = 1; (gtl_status =
85 	    input_getline(&input_line, &input_len, &len)) == GL_OK;) {
86 		++record_count;
87 		if (verbose > 1)
88 			dbenv->errx(dbenv, "reading %lu", (u_long)record_count);
89 
90 		/* The first non-blank line of the input is a column map. */
91 		if (is_first) {
92 			is_first = 0;
93 
94 			/* Count the fields we're expecting in the input. */
95 			if (input_field_count(
96 			    input_line, len, &field_count) != 0)
97 				return (1);
98 
99 		}
100 
101 		/* Allocate room for the table of offsets. */
102 		if (input_put_alloc(
103 		    &put_line, &put_len, len, field_count) != 0)
104 			return (1);
105 
106 		/*
107 		 * Build the offset table and create the record we're
108 		 * going to store.
109 		 */
110 		if (input_set_offset(put_line,
111 		    input_line, len, field_count) != 0)
112 			return (1);
113 
114 		++primary_key;
115 
116 		memcpy(put_line + (field_count + 2), input_line, len);
117 		data.data = put_line;
118 		data.size = (u_int32_t)
119 			((field_count + 2) * sizeof(u_int32_t) + len);
120 
121 		if (verbose > 1)
122 			(void)entry_print(
123 			    data.data, data.size, field_count);
124 
125 		/* Load the key/data pair into the database. */
126 		if ((ret = db->put(db, NULL, &key, &data, 0)) != 0) {
127 			dbenv->err(dbenv, ret,
128 			    "DB->put: %lu", (u_long)primary_key);
129 			return (1);
130 		}
131 	}
132 
133 	if (gtl_status != GL_EOF)
134 		return (1);
135 
136 	if (verbose)
137 		dbenv->errx(dbenv,
138 		    "%lu records read from the input file into the database",
139 		    record_count);
140 
141 	/*
142 	 * This program isn't transactional, limit the window for corruption.
143 	 */
144 	if ((ret = db->sync(db, 0)) != 0) {
145 		dbenv->err(dbenv, ret, "DB->sync");
146 		return (1);
147 	}
148 
149 	return (0);
150 }
151 
152 /*
153  * input_getline --
154  *	Read in a line of input into a buffer.
155  */
156 static getline_status
input_getline(char ** input_linep,size_t * input_lenp,size_t * lenp)157 input_getline(char **input_linep, size_t *input_lenp, size_t *lenp)
158 {
159 	size_t input_len, len;
160 	int ch;
161 	char *input_line, *p, *endp;
162 
163 	input_line = *input_linep;
164 	input_len = *input_lenp;
165 
166 	p = input_line;
167 	endp = input_line + input_len;
168 
169 	for (len = 0; (ch = getchar()) != EOF;) {
170 		if (ch == '\0')		/* Strip <nul> (\000) bytes. */
171 			continue;
172 		switch (ifmt) {
173 		case FORMAT_NL:
174 			if (ch == '\n')
175 				goto end;
176 			break;
177 		case FORMAT_EXCEL:
178 			/* Strip <nl> (\012) bytes. */
179 			if (ch == '\n')
180 				continue;
181 			/*
182 			 * <cr> (\015) bytes terminate lines.
183 			 * Skip blank lines.
184 			 */
185 			if (ch == '\015') {
186 				if (len == 0)
187 					continue;
188 				goto end;
189 			}
190 		}
191 		if (input_line == endp) {
192 			input_len += 256;
193 			input_len *= 2;
194 			if ((input_line =
195 			    realloc(input_line, input_len)) == NULL) {
196 				dbenv->err(dbenv, errno,
197 				    "unable to allocate %lu bytes for record",
198 				    (u_long)input_len);
199 				return (GL_FAIL);
200 			}
201 			p = input_line;
202 			endp = p + input_len;
203 		}
204 
205 		if (isprint(ch)) {	/* Strip unprintables. */
206 			*p++ = (char)ch;
207 			++len;
208 		}
209 	}
210 
211 end:	if (len == 0)
212 		return (GL_EOF);
213 
214 	*lenp = len;
215 	*input_linep = input_line;
216 	*input_lenp = input_len;
217 
218 	return (GL_OK);
219 }
220 
221 /*
222  * input_field_count --
223  *	Count the fields in the line.
224  */
225 static int
input_field_count(const char * line,size_t len,u_int32_t * field_countp)226 input_field_count(const char *line, size_t len, u_int32_t *field_countp)
227 {
228 	u_int32_t field_count;
229 	int quoted;
230 
231 	field_count = 1;
232 
233 	/*
234 	 * There are N-1 separators for N fields, that is, "a,b,c" is three
235 	 * fields, with two comma separators.
236 	 */
237 	switch (ifmt) {
238 	case FORMAT_EXCEL:
239 		quoted = 0;
240 		for (field_count = 1; len > 0; ++line, --len)
241 			if (*line == '"')
242 				quoted = !quoted;
243 			else if (*line == ',' && !quoted)
244 				++field_count;
245 		break;
246 	case FORMAT_NL:
247 		for (field_count = 1; len > 0; ++line, --len)
248 			if (*line == ',')
249 				++field_count;
250 		break;
251 	}
252 	*field_countp = field_count;
253 
254 	if (verbose)
255 		dbenv->errx(dbenv,
256 		    "input file made up of %lu fields", (u_int)field_count);
257 
258 	return (0);
259 }
260 
261 /*
262  * input_put_alloc --
263  *	Allocate room for the offset table plus the input.
264  */
265 static int
input_put_alloc(u_int32_t ** put_linep,size_t * put_lenp,size_t len,u_int32_t field_count)266 input_put_alloc(u_int32_t **put_linep,
267     size_t *put_lenp, size_t len, u_int32_t field_count)
268 {
269 	size_t total;
270 
271 	total = (field_count + 2) * sizeof(u_int32_t) + len;
272 	if (total > *put_lenp &&
273 	    (*put_linep = realloc(*put_linep, *put_lenp += total)) == NULL) {
274 		dbenv->err(dbenv, errno,
275 		    "unable to allocate %lu bytes for record",
276 		    (u_long)*put_lenp);
277 		return (1);
278 	}
279 	return (0);
280 }
281 
282 /*
283  * input_set_offset --
284  *	Build an offset table and record combination.
285  */
286 static int
input_set_offset(u_int32_t * put_line,char * input_line,size_t len,u_int32_t field_count)287 input_set_offset(u_int32_t *put_line,
288     char *input_line, size_t len, u_int32_t field_count)
289 {
290 	u_int32_t *op;
291 	int quoted;
292 	char *p, *endp;
293 
294 	op = put_line;
295 
296 	/* The first field is the version number. */
297 	*op++ = version;
298 
299 	/*
300 	 * Walk the input line, looking for comma separators.  It's an error
301 	 * to have too many or too few fields.
302 	 */
303 	*op++ = 0;
304 	quoted = 0;
305 	for (p = input_line, endp = input_line + len;; ++p) {
306 		if (ifmt == FORMAT_EXCEL && p < endp) {
307 			if (*p == '"')
308 				quoted = !quoted;
309 			if (quoted)
310 				continue;
311 		}
312 		if (*p == ',' || p == endp) {
313 			if (field_count == 0) {
314 				dbenv->errx(dbenv,
315 				    "record %lu: too many fields in the record",
316 				    record_count);
317 				return (1);
318 			}
319 			--field_count;
320 
321 			*op++ = (u_int32_t)(p - input_line) + 1;
322 
323 			if (verbose > 1)
324 				dbenv->errx(dbenv,
325 				    "offset %lu: {%.*s}", op[-1],
326 				    OFFSET_LEN(op, -2), input_line + op[-2]);
327 
328 			/*
329 			 * Don't insert a new field if the input lines ends
330 			 * in a comma.
331 			 */
332 			if (p == endp || p + 1 == endp)
333 				break;
334 		}
335 	}
336 	*op++ = (u_int32_t)(p - input_line);
337 
338 	if (field_count != 0) {
339 		dbenv->errx(dbenv,
340 		    "record %lu: not enough fields in the record",
341 		    record_count);
342 		return (1);
343 	}
344 	memcpy(op, input_line, len);
345 
346 	return (0);
347 }
348