1 /*-------------------------------------------------------------------------
2  *
3  * jsonb.h
4  *	  Declarations for jsonb data type support.
5  *
6  * Copyright (c) 1996-2016, PostgreSQL Global Development Group
7  *
8  * src/include/utils/jsonb.h
9  *
10  *-------------------------------------------------------------------------
11  */
12 #ifndef __JSONB_H__
13 #define __JSONB_H__
14 
15 #include "lib/stringinfo.h"
16 #include "utils/array.h"
17 #include "utils/numeric.h"
18 
19 /* Tokens used when sequentially processing a jsonb value */
20 typedef enum
21 {
22 	WJB_DONE,
23 	WJB_KEY,
24 	WJB_VALUE,
25 	WJB_ELEM,
26 	WJB_BEGIN_ARRAY,
27 	WJB_END_ARRAY,
28 	WJB_BEGIN_OBJECT,
29 	WJB_END_OBJECT
30 } JsonbIteratorToken;
31 
32 /* Strategy numbers for GIN index opclasses */
33 #define JsonbContainsStrategyNumber		7
34 #define JsonbExistsStrategyNumber		9
35 #define JsonbExistsAnyStrategyNumber	10
36 #define JsonbExistsAllStrategyNumber	11
37 
38 /*
39  * In the standard jsonb_ops GIN opclass for jsonb, we choose to index both
40  * keys and values.  The storage format is text.  The first byte of the text
41  * string distinguishes whether this is a key (always a string), null value,
42  * boolean value, numeric value, or string value.  However, array elements
43  * that are strings are marked as though they were keys; this imprecision
44  * supports the definition of the "exists" operator, which treats array
45  * elements like keys.  The remainder of the text string is empty for a null
46  * value, "t" or "f" for a boolean value, a normalized print representation of
47  * a numeric value, or the text of a string value.  However, if the length of
48  * this text representation would exceed JGIN_MAXLENGTH bytes, we instead hash
49  * the text representation and store an 8-hex-digit representation of the
50  * uint32 hash value, marking the prefix byte with an additional bit to
51  * distinguish that this has happened.  Hashing long strings saves space and
52  * ensures that we won't overrun the maximum entry length for a GIN index.
53  * (But JGIN_MAXLENGTH is quite a bit shorter than GIN's limit.  It's chosen
54  * to ensure that the on-disk text datum will have a short varlena header.)
55  * Note that when any hashed item appears in a query, we must recheck index
56  * matches against the heap tuple; currently, this costs nothing because we
57  * must always recheck for other reasons.
58  */
59 #define JGINFLAG_KEY	0x01	/* key (or string array element) */
60 #define JGINFLAG_NULL	0x02	/* null value */
61 #define JGINFLAG_BOOL	0x03	/* boolean value */
62 #define JGINFLAG_NUM	0x04	/* numeric value */
63 #define JGINFLAG_STR	0x05	/* string value (if not an array element) */
64 #define JGINFLAG_HASHED 0x10	/* OR'd into flag if value was hashed */
65 #define JGIN_MAXLENGTH	125		/* max length of text part before hashing */
66 
67 /* Convenience macros */
68 #define DatumGetJsonb(d)	((Jsonb *) PG_DETOAST_DATUM(d))
69 #define JsonbGetDatum(p)	PointerGetDatum(p)
70 #define PG_GETARG_JSONB(x)	DatumGetJsonb(PG_GETARG_DATUM(x))
71 #define PG_RETURN_JSONB(x)	PG_RETURN_POINTER(x)
72 
73 typedef struct JsonbPair JsonbPair;
74 typedef struct JsonbValue JsonbValue;
75 
76 /*
77  * Jsonbs are varlena objects, so must meet the varlena convention that the
78  * first int32 of the object contains the total object size in bytes.  Be sure
79  * to use VARSIZE() and SET_VARSIZE() to access it, though!
80  *
81  * Jsonb is the on-disk representation, in contrast to the in-memory JsonbValue
82  * representation.  Often, JsonbValues are just shims through which a Jsonb
83  * buffer is accessed, but they can also be deep copied and passed around.
84  *
85  * Jsonb is a tree structure. Each node in the tree consists of a JEntry
86  * header and a variable-length content (possibly of zero size).  The JEntry
87  * header indicates what kind of a node it is, e.g. a string or an array,
88  * and provides the length of its variable-length portion.
89  *
90  * The JEntry and the content of a node are not stored physically together.
91  * Instead, the container array or object has an array that holds the JEntrys
92  * of all the child nodes, followed by their variable-length portions.
93  *
94  * The root node is an exception; it has no parent array or object that could
95  * hold its JEntry. Hence, no JEntry header is stored for the root node.  It
96  * is implicitly known that the root node must be an array or an object,
97  * so we can get away without the type indicator as long as we can distinguish
98  * the two.  For that purpose, both an array and an object begin with a uint32
99  * header field, which contains an JB_FOBJECT or JB_FARRAY flag.  When a naked
100  * scalar value needs to be stored as a Jsonb value, what we actually store is
101  * an array with one element, with the flags in the array's header field set
102  * to JB_FSCALAR | JB_FARRAY.
103  *
104  * Overall, the Jsonb struct requires 4-bytes alignment. Within the struct,
105  * the variable-length portion of some node types is aligned to a 4-byte
106  * boundary, while others are not. When alignment is needed, the padding is
107  * in the beginning of the node that requires it. For example, if a numeric
108  * node is stored after a string node, so that the numeric node begins at
109  * offset 3, the variable-length portion of the numeric node will begin with
110  * one padding byte so that the actual numeric data is 4-byte aligned.
111  */
112 
113 /*
114  * JEntry format.
115  *
116  * The least significant 28 bits store either the data length of the entry,
117  * or its end+1 offset from the start of the variable-length portion of the
118  * containing object.  The next three bits store the type of the entry, and
119  * the high-order bit tells whether the least significant bits store a length
120  * or an offset.
121  *
122  * The reason for the offset-or-length complication is to compromise between
123  * access speed and data compressibility.  In the initial design each JEntry
124  * always stored an offset, but this resulted in JEntry arrays with horrible
125  * compressibility properties, so that TOAST compression of a JSONB did not
126  * work well.  Storing only lengths would greatly improve compressibility,
127  * but it makes random access into large arrays expensive (O(N) not O(1)).
128  * So what we do is store an offset in every JB_OFFSET_STRIDE'th JEntry and
129  * a length in the rest.  This results in reasonably compressible data (as
130  * long as the stride isn't too small).  We may have to examine as many as
131  * JB_OFFSET_STRIDE JEntrys in order to find out the offset or length of any
132  * given item, but that's still O(1) no matter how large the container is.
133  *
134  * We could avoid eating a flag bit for this purpose if we were to store
135  * the stride in the container header, or if we were willing to treat the
136  * stride as an unchangeable constant.  Neither of those options is very
137  * attractive though.
138  */
139 typedef uint32 JEntry;
140 
141 #define JENTRY_OFFLENMASK		0x0FFFFFFF
142 #define JENTRY_TYPEMASK			0x70000000
143 #define JENTRY_HAS_OFF			0x80000000
144 
145 /* values stored in the type bits */
146 #define JENTRY_ISSTRING			0x00000000
147 #define JENTRY_ISNUMERIC		0x10000000
148 #define JENTRY_ISBOOL_FALSE		0x20000000
149 #define JENTRY_ISBOOL_TRUE		0x30000000
150 #define JENTRY_ISNULL			0x40000000
151 #define JENTRY_ISCONTAINER		0x50000000		/* array or object */
152 
153 /* Access macros.  Note possible multiple evaluations */
154 #define JBE_OFFLENFLD(je_)		((je_) & JENTRY_OFFLENMASK)
155 #define JBE_HAS_OFF(je_)		(((je_) & JENTRY_HAS_OFF) != 0)
156 #define JBE_ISSTRING(je_)		(((je_) & JENTRY_TYPEMASK) == JENTRY_ISSTRING)
157 #define JBE_ISNUMERIC(je_)		(((je_) & JENTRY_TYPEMASK) == JENTRY_ISNUMERIC)
158 #define JBE_ISCONTAINER(je_)	(((je_) & JENTRY_TYPEMASK) == JENTRY_ISCONTAINER)
159 #define JBE_ISNULL(je_)			(((je_) & JENTRY_TYPEMASK) == JENTRY_ISNULL)
160 #define JBE_ISBOOL_TRUE(je_)	(((je_) & JENTRY_TYPEMASK) == JENTRY_ISBOOL_TRUE)
161 #define JBE_ISBOOL_FALSE(je_)	(((je_) & JENTRY_TYPEMASK) == JENTRY_ISBOOL_FALSE)
162 #define JBE_ISBOOL(je_)			(JBE_ISBOOL_TRUE(je_) || JBE_ISBOOL_FALSE(je_))
163 
164 /* Macro for advancing an offset variable to the next JEntry */
165 #define JBE_ADVANCE_OFFSET(offset, je) \
166 	do { \
167 		JEntry	je_ = (je); \
168 		if (JBE_HAS_OFF(je_)) \
169 			(offset) = JBE_OFFLENFLD(je_); \
170 		else \
171 			(offset) += JBE_OFFLENFLD(je_); \
172 	} while(0)
173 
174 /*
175  * We store an offset, not a length, every JB_OFFSET_STRIDE children.
176  * Caution: this macro should only be referenced when creating a JSONB
177  * value.  When examining an existing value, pay attention to the HAS_OFF
178  * bits instead.  This allows changes in the offset-placement heuristic
179  * without breaking on-disk compatibility.
180  */
181 #define JB_OFFSET_STRIDE		32
182 
183 /*
184  * A jsonb array or object node, within a Jsonb Datum.
185  *
186  * An array has one child for each element, stored in array order.
187  *
188  * An object has two children for each key/value pair.  The keys all appear
189  * first, in key sort order; then the values appear, in an order matching the
190  * key order.  This arrangement keeps the keys compact in memory, making a
191  * search for a particular key more cache-friendly.
192  */
193 typedef struct JsonbContainer
194 {
195 	uint32		header;			/* number of elements or key/value pairs, and
196 								 * flags */
197 	JEntry		children[FLEXIBLE_ARRAY_MEMBER];
198 
199 	/* the data for each child node follows. */
200 } JsonbContainer;
201 
202 /* flags for the header-field in JsonbContainer */
203 #define JB_CMASK				0x0FFFFFFF		/* mask for count field */
204 #define JB_FSCALAR				0x10000000		/* flag bits */
205 #define JB_FOBJECT				0x20000000
206 #define JB_FARRAY				0x40000000
207 
208 /* The top-level on-disk format for a jsonb datum. */
209 typedef struct
210 {
211 	int32		vl_len_;		/* varlena header (do not touch directly!) */
212 	JsonbContainer root;
213 } Jsonb;
214 
215 /* convenience macros for accessing the root container in a Jsonb datum */
216 #define JB_ROOT_COUNT(jbp_)		( *(uint32*) VARDATA(jbp_) & JB_CMASK)
217 #define JB_ROOT_IS_SCALAR(jbp_) ( *(uint32*) VARDATA(jbp_) & JB_FSCALAR)
218 #define JB_ROOT_IS_OBJECT(jbp_) ( *(uint32*) VARDATA(jbp_) & JB_FOBJECT)
219 #define JB_ROOT_IS_ARRAY(jbp_)	( *(uint32*) VARDATA(jbp_) & JB_FARRAY)
220 
221 
222 /*
223  * JsonbValue:	In-memory representation of Jsonb.  This is a convenient
224  * deserialized representation, that can easily support using the "val"
225  * union across underlying types during manipulation.  The Jsonb on-disk
226  * representation has various alignment considerations.
227  */
228 struct JsonbValue
229 {
230 	enum
231 	{
232 		/* Scalar types */
233 		jbvNull = 0x0,
234 		jbvString,
235 		jbvNumeric,
236 		jbvBool,
237 		/* Composite types */
238 		jbvArray = 0x10,
239 		jbvObject,
240 		/* Binary (i.e. struct Jsonb) jbvArray/jbvObject */
241 		jbvBinary
242 	}			type;			/* Influences sort order */
243 
244 	union
245 	{
246 		Numeric numeric;
247 		bool		boolean;
248 		struct
249 		{
250 			int			len;
251 			char	   *val;	/* Not necessarily null-terminated */
252 		}			string;		/* String primitive type */
253 
254 		struct
255 		{
256 			int			nElems;
257 			JsonbValue *elems;
258 			bool		rawScalar;		/* Top-level "raw scalar" array? */
259 		}			array;		/* Array container type */
260 
261 		struct
262 		{
263 			int			nPairs; /* 1 pair, 2 elements */
264 			JsonbPair  *pairs;
265 		}			object;		/* Associative container type */
266 
267 		struct
268 		{
269 			int			len;
270 			JsonbContainer *data;
271 		}			binary;		/* Array or object, in on-disk format */
272 	}			val;
273 };
274 
275 #define IsAJsonbScalar(jsonbval)	((jsonbval)->type >= jbvNull && \
276 									 (jsonbval)->type <= jbvBool)
277 
278 /*
279  * Key/value pair within an Object.
280  *
281  * This struct type is only used briefly while constructing a Jsonb; it is
282  * *not* the on-disk representation.
283  *
284  * Pairs with duplicate keys are de-duplicated.  We store the originally
285  * observed pair ordering for the purpose of removing duplicates in a
286  * well-defined way (which is "last observed wins").
287  */
288 struct JsonbPair
289 {
290 	JsonbValue	key;			/* Must be a jbvString */
291 	JsonbValue	value;			/* May be of any type */
292 	uint32		order;			/* Pair's index in original sequence */
293 };
294 
295 /* Conversion state used when parsing Jsonb from text, or for type coercion */
296 typedef struct JsonbParseState
297 {
298 	JsonbValue	contVal;
299 	Size		size;
300 	struct JsonbParseState *next;
301 } JsonbParseState;
302 
303 /*
304  * JsonbIterator holds details of the type for each iteration. It also stores a
305  * Jsonb varlena buffer, which can be directly accessed in some contexts.
306  */
307 typedef enum
308 {
309 	JBI_ARRAY_START,
310 	JBI_ARRAY_ELEM,
311 	JBI_OBJECT_START,
312 	JBI_OBJECT_KEY,
313 	JBI_OBJECT_VALUE
314 } JsonbIterState;
315 
316 typedef struct JsonbIterator
317 {
318 	/* Container being iterated */
319 	JsonbContainer *container;
320 	uint32		nElems;			/* Number of elements in children array (will
321 								 * be nPairs for objects) */
322 	bool		isScalar;		/* Pseudo-array scalar value? */
323 	JEntry	   *children;		/* JEntrys for child nodes */
324 	/* Data proper.  This points to the beginning of the variable-length data */
325 	char	   *dataProper;
326 
327 	/* Current item in buffer (up to nElems) */
328 	int			curIndex;
329 
330 	/* Data offset corresponding to current item */
331 	uint32		curDataOffset;
332 
333 	/*
334 	 * If the container is an object, we want to return keys and values
335 	 * alternately; so curDataOffset points to the current key, and
336 	 * curValueOffset points to the current value.
337 	 */
338 	uint32		curValueOffset;
339 
340 	/* Private state */
341 	JsonbIterState state;
342 
343 	struct JsonbIterator *parent;
344 } JsonbIterator;
345 
346 /* I/O routines */
347 extern Datum jsonb_in(PG_FUNCTION_ARGS);
348 extern Datum jsonb_out(PG_FUNCTION_ARGS);
349 extern Datum jsonb_recv(PG_FUNCTION_ARGS);
350 extern Datum jsonb_send(PG_FUNCTION_ARGS);
351 extern Datum jsonb_typeof(PG_FUNCTION_ARGS);
352 
353 /* generator routines */
354 extern Datum to_jsonb(PG_FUNCTION_ARGS);
355 
356 extern Datum jsonb_build_object(PG_FUNCTION_ARGS);
357 extern Datum jsonb_build_object_noargs(PG_FUNCTION_ARGS);
358 extern Datum jsonb_build_array(PG_FUNCTION_ARGS);
359 extern Datum jsonb_build_array_noargs(PG_FUNCTION_ARGS);
360 extern Datum jsonb_object(PG_FUNCTION_ARGS);
361 extern Datum jsonb_object_two_arg(PG_FUNCTION_ARGS);
362 
363 /* jsonb_agg, json_object_agg functions */
364 extern Datum jsonb_agg_transfn(PG_FUNCTION_ARGS);
365 extern Datum jsonb_agg_finalfn(PG_FUNCTION_ARGS);
366 extern Datum jsonb_object_agg_transfn(PG_FUNCTION_ARGS);
367 extern Datum jsonb_object_agg_finalfn(PG_FUNCTION_ARGS);
368 
369 /* Indexing-related ops */
370 extern Datum jsonb_exists(PG_FUNCTION_ARGS);
371 extern Datum jsonb_exists_any(PG_FUNCTION_ARGS);
372 extern Datum jsonb_exists_all(PG_FUNCTION_ARGS);
373 extern Datum jsonb_contains(PG_FUNCTION_ARGS);
374 extern Datum jsonb_contained(PG_FUNCTION_ARGS);
375 extern Datum jsonb_ne(PG_FUNCTION_ARGS);
376 extern Datum jsonb_lt(PG_FUNCTION_ARGS);
377 extern Datum jsonb_gt(PG_FUNCTION_ARGS);
378 extern Datum jsonb_le(PG_FUNCTION_ARGS);
379 extern Datum jsonb_ge(PG_FUNCTION_ARGS);
380 extern Datum jsonb_eq(PG_FUNCTION_ARGS);
381 extern Datum jsonb_cmp(PG_FUNCTION_ARGS);
382 extern Datum jsonb_hash(PG_FUNCTION_ARGS);
383 
384 /* GIN support functions for jsonb_ops */
385 extern Datum gin_compare_jsonb(PG_FUNCTION_ARGS);
386 extern Datum gin_extract_jsonb(PG_FUNCTION_ARGS);
387 extern Datum gin_extract_jsonb_query(PG_FUNCTION_ARGS);
388 extern Datum gin_consistent_jsonb(PG_FUNCTION_ARGS);
389 extern Datum gin_triconsistent_jsonb(PG_FUNCTION_ARGS);
390 
391 /* GIN support functions for jsonb_path_ops */
392 extern Datum gin_extract_jsonb_path(PG_FUNCTION_ARGS);
393 extern Datum gin_extract_jsonb_query_path(PG_FUNCTION_ARGS);
394 extern Datum gin_consistent_jsonb_path(PG_FUNCTION_ARGS);
395 extern Datum gin_triconsistent_jsonb_path(PG_FUNCTION_ARGS);
396 
397 /* pretty printer, returns text */
398 extern Datum jsonb_pretty(PG_FUNCTION_ARGS);
399 
400 /* concatenation */
401 extern Datum jsonb_concat(PG_FUNCTION_ARGS);
402 
403 /* deletion */
404 extern Datum jsonb_delete(PG_FUNCTION_ARGS);
405 extern Datum jsonb_delete_idx(PG_FUNCTION_ARGS);
406 extern Datum jsonb_delete_path(PG_FUNCTION_ARGS);
407 
408 /* replacement */
409 extern Datum jsonb_set(PG_FUNCTION_ARGS);
410 
411 /* insert after or before (for arrays) */
412 extern Datum jsonb_insert(PG_FUNCTION_ARGS);
413 
414 /* Support functions */
415 extern uint32 getJsonbOffset(const JsonbContainer *jc, int index);
416 extern uint32 getJsonbLength(const JsonbContainer *jc, int index);
417 extern int	compareJsonbContainers(JsonbContainer *a, JsonbContainer *b);
418 extern JsonbValue *findJsonbValueFromContainer(JsonbContainer *sheader,
419 							uint32 flags,
420 							JsonbValue *key);
421 extern JsonbValue *getIthJsonbValueFromContainer(JsonbContainer *sheader,
422 							  uint32 i);
423 extern JsonbValue *pushJsonbValue(JsonbParseState **pstate,
424 			   JsonbIteratorToken seq, JsonbValue *jbVal);
425 extern JsonbIterator *JsonbIteratorInit(JsonbContainer *container);
426 extern JsonbIteratorToken JsonbIteratorNext(JsonbIterator **it, JsonbValue *val,
427 				  bool skipNested);
428 extern Jsonb *JsonbValueToJsonb(JsonbValue *val);
429 extern bool JsonbDeepContains(JsonbIterator **val,
430 				  JsonbIterator **mContained);
431 extern void JsonbHashScalarValue(const JsonbValue *scalarVal, uint32 *hash);
432 
433 /* jsonb.c support functions */
434 extern char *JsonbToCString(StringInfo out, JsonbContainer *in,
435 			   int estimated_len);
436 extern char *JsonbToCStringIndent(StringInfo out, JsonbContainer *in,
437 					 int estimated_len);
438 
439 
440 #endif   /* __JSONB_H__ */
441