1 #include "lib/mlr_globals.h"
2 #include "lib/mlrutil.h"
3 #include "cli/json_array_ingest.h"
4 #include "input/mlr_json_adapter.h"
5 
6 static lrec_t* validate_millerable_object(json_value_t* pjson_object, char* flatten_sep,
7 	json_array_ingest_t json_array_ingest);
8 static int populate_from_nested_object(lrec_t* prec, json_value_t* pjson_object, char* prefix, char* flatten_sep,
9 	json_array_ingest_t json_array_ingest);
10 static int populate_from_nested_array(lrec_t* prec, json_value_t* pjson_array, char* prefix, char* flatten_sep,
11 	json_array_ingest_t json_array_ingest);
12 
13 // ----------------------------------------------------------------
// Converts a parsed top-level JSON value into lrecs and appends them to precords.
//
// * If the top-level value is an array, each element must be an object; one lrec is appended per element.
// * If the top-level value is an object, a single lrec is appended.
// * Anything else is rejected.
//
// Returns TRUE on success. On failure, writes a diagnostic to stderr (here or in a callee) and returns
// FALSE; any records appended before the failure remain in precords.
int reference_json_objects_as_lrecs(sllv_t* precords, json_value_t* ptop_level_json, char* flatten_sep,
	json_array_ingest_t json_array_ingest)
{
	if (ptop_level_json->type == JSON_ARRAY) {
		int n = ptop_level_json->u.array.length;
		for (int i = 0; i < n; i++) {
			json_value_t* pnext_level_json = ptop_level_json->u.array.values[i];
			if (pnext_level_json->type != JSON_OBJECT) {
				// Report the type of the offending element, not of the enclosing array.
				fprintf(stderr,
					"%s: found non-object (type %s) within top-level array. This is valid but unmillerable JSON.\n",
					MLR_GLOBALS.bargv0, json_describe_type(pnext_level_json->type));
				return FALSE;
			}
			lrec_t* prec = validate_millerable_object(pnext_level_json, flatten_sep, json_array_ingest);
			if (prec == NULL)
				return FALSE;
			sllv_append(precords, prec);
		}
	} else if (ptop_level_json->type == JSON_OBJECT) {
		lrec_t* prec = validate_millerable_object(ptop_level_json, flatten_sep, json_array_ingest);
		if (prec == NULL)
			return FALSE;
		sllv_append(precords, prec);
	} else {
		fprintf(stderr,
			"%s: found non-terminal (type %s) at top level. This is valid but unmillerable JSON.\n",
			MLR_GLOBALS.bargv0, json_describe_type(ptop_level_json->type));
		return FALSE;
	}
	return TRUE;
}
45 
46 // ----------------------------------------------------------------
47 // Returns NULL if the JSON object is not millerable, else returns a new lrec with string pointers
48 // backed by the JSON object.
49 //
50 // Precondition: the JSON value is assumed to have already been checked to be of type JSON_OBJECT.
51 
validate_millerable_object(json_value_t * pjson,char * flatten_sep,json_array_ingest_t json_array_ingest)52 lrec_t* validate_millerable_object(json_value_t* pjson, char* flatten_sep, json_array_ingest_t json_array_ingest) {
53 	lrec_t* prec = lrec_unbacked_alloc();
54 	int n = pjson->u.array.length;
55 	for (int i = 0; i < n; i++) {
56 		json_object_entry_t* pobject_entry = &pjson->u.object.p.values[i];
57 		char* key = (char*)pobject_entry->name;
58 		char* prefix = NULL;
59 
60 		json_value_t* pjson_value = pobject_entry->pvalue;
61 		switch (pjson_value->type) {
62 
63 		case JSON_NONE:
64 			lrec_put(prec, key, "", NO_FREE);
65 			break;
66 		case JSON_NULL:
67 			lrec_put(prec, key, "", NO_FREE);
68 			break;
69 
70 		case JSON_OBJECT:
71 			// This could be made more efficient ... the string length is in the json_value_t.
72 			prefix = mlr_paste_2_strings(key, flatten_sep);
73 			if (!populate_from_nested_object(prec, pjson_value, prefix, flatten_sep, json_array_ingest))
74 				return NULL;
75 			free(prefix);
76 			break;
77 		case JSON_ARRAY:
78 			switch (json_array_ingest) {
79 			case JSON_ARRAY_INGEST_FATAL:
80 				fprintf(stderr,
81 					"%s: found array item within JSON object. This is valid but unmillerable JSON.\n"
82 					"Use --json-skip-arrays-on-input to exclude these from input without fataling.\n"
83 					"Or, --json-map-arrays-on-input to convert them to integer-indexed maps.\n",
84 					MLR_GLOBALS.bargv0);
85 				return NULL;
86 				break;
87 			case JSON_ARRAY_INGEST_AS_MAP:
88 				prefix = mlr_paste_2_strings(key, flatten_sep);
89 				if (!populate_from_nested_array(prec, pjson_value, prefix, flatten_sep, json_array_ingest)) {
90 					free(prefix);
91 					return NULL;
92 				}
93 				free(prefix);
94 				break;
95 			// xxx other cases!
96 			default:
97 				break;
98 			}
99 			break;
100 
101 		case JSON_STRING:
102 			lrec_put(prec, key, pjson_value->u.string.ptr, NO_FREE);
103 			break;
104 
105 		case JSON_BOOLEAN:
106 			lrec_put(prec, key, pjson_value->u.boolean.sval, NO_FREE);
107 			break;
108 		case JSON_INTEGER:
109 			lrec_put(prec, key, pjson_value->u.integer.sval, NO_FREE);
110 			break;
111 		case JSON_DOUBLE:
112 			lrec_put(prec, key, pjson_value->u.dbl.sval, NO_FREE);
113 			break;
114 		default:
115 			MLR_INTERNAL_CODING_ERROR();
116 			break;
117 		}
118 
119 	}
120 	return prec;
121 }
122 
123 // ----------------------------------------------------------------
124 // Example: the JSON object has { "a": { "b" : 1, "c" : 2 } }. Then we add "a:b" => "1" and "a:c" => "2"
125 // to the lrec.
126 
populate_from_nested_object(lrec_t * prec,json_value_t * pjson_object,char * prefix,char * flatten_sep,json_array_ingest_t json_array_ingest)127 static int populate_from_nested_object(lrec_t* prec, json_value_t* pjson_object, char* prefix, char* flatten_sep,
128 	json_array_ingest_t json_array_ingest)
129 {
130 	int n = pjson_object->u.object.length;
131 	for (int i = 0; i < n; i++) {
132 		json_object_entry_t* pobject_entry = &pjson_object->u.object.p.values[i];
133 		char* json_key = (char*)pobject_entry->name;
134 		json_value_t* pjson_value = pobject_entry->pvalue;
135 		char* lrec_key = mlr_paste_2_strings(prefix, json_key);
136 		char* next_prefix = NULL;
137 
138 		switch (pjson_value->type) {
139 		case JSON_NONE:
140 			lrec_put(prec, lrec_key, "", FREE_ENTRY_KEY);
141 			break;
142 		case JSON_NULL:
143 			lrec_put(prec, lrec_key, "", FREE_ENTRY_KEY);
144 			break;
145 		case JSON_STRING:
146 			lrec_put(prec, lrec_key, pjson_value->u.string.ptr, FREE_ENTRY_KEY);
147 			break;
148 		case JSON_BOOLEAN:
149 			lrec_put(prec, lrec_key, pjson_value->u.boolean.sval, FREE_ENTRY_KEY);
150 			break;
151 		case JSON_OBJECT:
152 			next_prefix = mlr_paste_2_strings(lrec_key, flatten_sep);
153 			if (!populate_from_nested_object(prec, pjson_value, next_prefix, flatten_sep, json_array_ingest))
154 				return FALSE;
155 			free(next_prefix);
156 			free(lrec_key);
157 			break;
158 		case JSON_ARRAY:
159 			switch (json_array_ingest) {
160 			case JSON_ARRAY_INGEST_FATAL:
161 				fprintf(stderr,
162 					"%s: found array item within JSON object. This is valid but unmillerable JSON.\n"
163 					"Use --json-skip-arrays-on-input to exclude these from input without fataling.\n"
164 					"Or, --json-map-arrays-on-input to convert them to integer-indexed maps.\n",
165 					MLR_GLOBALS.bargv0);
166 				free(lrec_key);
167 				return FALSE;
168 				break;
169 			case JSON_ARRAY_INGEST_AS_MAP:
170 				next_prefix = mlr_paste_2_strings(lrec_key, flatten_sep);
171 				if (!populate_from_nested_array(prec, pjson_value, next_prefix, flatten_sep, json_array_ingest)) {
172 					free(next_prefix);
173 					free(lrec_key);
174 					return FALSE;
175 				}
176 				free(next_prefix);
177 				free(lrec_key);
178 				break;
179 			// xxx other cases!
180 			default:
181 				free(lrec_key);
182 				break;
183 			}
184 			break;
185 		case JSON_INTEGER:
186 			lrec_put(prec, lrec_key, pjson_value->u.integer.sval, FREE_ENTRY_KEY);
187 			break;
188 		case JSON_DOUBLE:
189 			lrec_put(prec, lrec_key, pjson_value->u.dbl.sval, FREE_ENTRY_KEY);
190 			break;
191 		default:
192 			MLR_INTERNAL_CODING_ERROR();
193 			break;
194 		}
195 	}
196 	return TRUE;
197 }
198 
// Flattens a JSON array into the lrec as if it were a map keyed by element position: element i of the
// array is stored under the key prefix + "i" (positions start at 0).
//
// prefix is the already-flattened key path of the enclosing value, including its trailing flatten_sep;
// it is owned by the caller and is not freed here. Each scalar element is stored under a newly pasted
// key which is handed to the lrec with FREE_ENTRY_KEY. For nested objects and arrays the pasted key is
// only used to build the next prefix and is freed here.
//
// Returns TRUE on success, FALSE if an unmillerable value was found (a diagnostic has then been written
// to stderr, here or in a callee).
static int populate_from_nested_array(lrec_t* prec, json_value_t* pjson_array, char* prefix, char* flatten_sep,
	json_array_ingest_t json_array_ingest)
{
	int n = pjson_array->u.array.length;
	for (int i = 0; i < n; i++) {
		json_value_t* pjson_value = pjson_array->u.array.values[i];

		// The index string is only needed to build lrec_key; free it right away if the helper says so.
		char free_flags = NO_FREE;
		char* json_key = low_int_to_string(i, &free_flags);
		char* lrec_key = mlr_paste_2_strings(prefix, json_key);
		if (free_flags)
			free(json_key);
		char* next_prefix = NULL;

		switch (pjson_value->type) {
		case JSON_NONE:
		case JSON_NULL:
			lrec_put(prec, lrec_key, "", FREE_ENTRY_KEY);
			break;
		case JSON_STRING:
			lrec_put(prec, lrec_key, pjson_value->u.string.ptr, FREE_ENTRY_KEY);
			break;
		case JSON_BOOLEAN:
			lrec_put(prec, lrec_key, pjson_value->u.boolean.sval, FREE_ENTRY_KEY);
			break;
		case JSON_OBJECT: {
			next_prefix = mlr_paste_2_strings(lrec_key, flatten_sep);
			int ok = populate_from_nested_object(prec, pjson_value, next_prefix, flatten_sep, json_array_ingest);
			// Both temporaries are released whether or not the recursion succeeded.
			free(next_prefix);
			free(lrec_key);
			if (!ok)
				return FALSE;
			break;
		}
		case JSON_ARRAY:
			switch (json_array_ingest) {
			case JSON_ARRAY_INGEST_FATAL:
				fprintf(stderr,
					"%s: found array item within JSON object. This is valid but unmillerable JSON.\n"
					"Use --json-skip-arrays-on-input to exclude these from input without fataling.\n"
					"Or, --json-map-arrays-on-input to convert them to integer-indexed maps.\n",
					MLR_GLOBALS.bargv0);
				// The key was never handed to the lrec, so it must be released here.
				free(lrec_key);
				return FALSE;
				break;
			case JSON_ARRAY_INGEST_AS_MAP: {
				next_prefix = mlr_paste_2_strings(lrec_key, flatten_sep);
				int ok = populate_from_nested_array(prec, pjson_value, next_prefix, flatten_sep, json_array_ingest);
				free(lrec_key);
				free(next_prefix);
				if (!ok)
					return FALSE;
				break;
			}
			// xxx other cases!
			default:
				// Any other ingest mode: the nested array is left out of the record.
				free(lrec_key);
				break;
			}
			break;

		case JSON_INTEGER:
			lrec_put(prec, lrec_key, pjson_value->u.integer.sval, FREE_ENTRY_KEY);
			break;
		case JSON_DOUBLE:
			lrec_put(prec, lrec_key, pjson_value->u.dbl.sval, FREE_ENTRY_KEY);
			break;
		default:
			MLR_INTERNAL_CODING_ERROR();
			break;
		}

	}
	return TRUE;
}
274 
275 // ----------------------------------------------------------------
276 // * The buffer is an entire JSON blob, e.g. contents from stdio read; peof-psof is the file size so peof is one
277 //   byte *after* the last valid file byte.
278 // * The buffer is not assumed to be null-terminated.
279 // * Any lines beginning with comment_string are modified by poking space characters up to line_term.
mlr_json_strip_comments(char * psof,char * peof,comment_handling_t comment_handling,char * comment_string,char * line_term)280 void mlr_json_strip_comments(char* psof, char* peof, comment_handling_t comment_handling, char* comment_string, char* line_term) {
281 	int comment_string_len = strlen(comment_string);
282 	int line_term_len = strlen(line_term);
283 	int at_line_start = TRUE;
284 	for (char* p = psof; p < peof; /* increment in loop */) {
285 		if (streqn(p, line_term, line_term_len)) {
286 			p += line_term_len;
287 			at_line_start = TRUE;
288 		} else if (at_line_start && streqn(p, comment_string, comment_string_len)) {
289 			// Fill with spaces to end of line
290 			while (p < peof && !streqn(p, line_term, line_term_len)) {
291 				if (comment_handling == PASS_COMMENTS)
292 					fputc(*p, stdout);
293 				*p = ' ';
294 				p++;
295 			}
296 			if (comment_handling == PASS_COMMENTS)
297 				fputs(line_term, stdout);
298 			at_line_start = TRUE;
299 		} else {
300 			at_line_start = FALSE;
301 			p++;
302 		}
303 	}
304 }
305 
306 // ----------------------------------------------------------------
// I'm using a 3rd-party JSON parser and it's easier to strip all trailing whitespace
// than to tweak the parser to handle it.
//
// psof is the first byte of the buffer; on entry *ppeof is one past the last valid byte.
// On return *ppeof is one past the last byte that is not a space, tab, carriage return, or newline
// (equal to psof if the buffer is empty or consists entirely of such bytes). The buffer contents are
// not modified.
void mlr_json_end_strip(char* psof, char** ppeof) {
	char* peof = *ppeof;

	// Walk the end marker backward while the byte just before it is whitespace. Comparing peof against
	// psof (rather than decrementing a last-byte pointer) never forms a pointer before the buffer start.
	while (peof > psof) {
		char c = peof[-1];
		if (c != ' ' && c != '\t' && c != '\r' && c != '\n')
			break;
		peof--;
	}

	*ppeof = peof;
}
321