1 //
2 // $Id$
3 //
4 
5 //
6 // Copyright (c) 2011-2016, Andrew Aksyonoff
7 // Copyright (c) 2011-2016, Sphinx Technologies Inc
8 // All rights reserved
9 //
10 // This program is free software; you can redistribute it and/or modify
11 // it under the terms of the GNU General Public License. You should have
12 // received a copy of the GPL license along with this program; if you
13 // did not, you can find it at http://www.gnu.org/
14 //
15 
16 #include "sphinxjson.h"
17 #include "sphinxint.h"
18 
19 #if USE_WINDOWS
20 #include <io.h> // for isatty() in llsphinxjson.c
21 #endif
22 
23 //////////////////////////////////////////////////////////////////////////
24 
25 /// parser view on a generic node
26 struct JsonNode_t
27 {
28 	ESphJsonType	m_eType;		///< node type
29 	int64_t			m_iValue;		///< integer value, only used for JSON_INT32 and JSON_INT64
30 	double			m_fValue;		///< floating point value, only used for JSON_DOUBLE
31 	int				m_iStart;		///< string value, start index (inclusive) into m_pBuf, only used for JSON_STRING
32 	int				m_iEnd;			///< string value, end index (exclusive) into m_pBuf, only used for JSON_STRING
33 	int				m_iHandle;		///< subobject value, index into m_dNodes storage
34 	int				m_iKeyStart;	///< node name, start index (inclusive) into m_pBuf
35 	int				m_iKeyEnd;		///< node name, end index (exclusive) into m_pBuf
36 
JsonNode_tJsonNode_t37 	JsonNode_t ()
38 		: m_eType ( JSON_TOTAL )
39 	{}
40 };
41 #define YYSTYPE JsonNode_t
42 
43 // must be included after YYSTYPE declaration
44 class JsonParser_c;
45 
46 /// actually, JSON-to-SphinxBSON converter helper, but who cares
47 class JsonParser_c : ISphNoncopyable
48 {
49 public:
50 	void *				m_pScanner;
51 	const char *		m_pLastToken;
52 	CSphVector<BYTE> &	m_dBuffer;
53 	CSphString &		m_sError;
54 	bool				m_bAutoconv;
55 	bool				m_bToLowercase;
56 	char *				m_pBuf;
57 	CSphVector < CSphVector<JsonNode_t> >	m_dNodes;
58 	CSphVector<JsonNode_t>					m_dEmpty;
59 
60 public:
JsonParser_c(CSphVector<BYTE> & dBuffer,bool bAutoconv,bool bToLowercase,CSphString & sError)61 	JsonParser_c ( CSphVector<BYTE> & dBuffer, bool bAutoconv, bool bToLowercase, CSphString & sError )
62 		: m_pScanner ( NULL )
63 		, m_pLastToken ( NULL )
64 		, m_dBuffer ( dBuffer )
65 		, m_sError ( sError )
66 		, m_bAutoconv ( bAutoconv )
67 		, m_bToLowercase ( bToLowercase )
68 	{
69 		// reserve 4 bytes for Bloom mask
70 		StoreInt ( 0 );
71 	}
72 
73 protected:
BufAlloc(int iLen)74 	BYTE * BufAlloc ( int iLen )
75 	{
76 		int iPos = m_dBuffer.GetLength();
77 		m_dBuffer.Resize ( m_dBuffer.GetLength()+iLen );
78 		return m_dBuffer.Begin()+iPos;
79 	}
80 
StoreInt(int v)81 	void StoreInt ( int v )
82 	{
83 		BYTE * p = BufAlloc ( 4 );
84 		*p++ = BYTE(DWORD(v));
85 		*p++ = BYTE(DWORD(v) >> 8);
86 		*p++ = BYTE(DWORD(v) >> 16);
87 		*p++ = BYTE(DWORD(v) >> 24);
88 	}
89 
StoreBigint(int64_t v)90 	void StoreBigint ( int64_t v )
91 	{
92 		StoreInt ( (DWORD)( v & 0xffffffffUL ) );
93 		StoreInt ( (int)( v>>32 ) );
94 	}
95 
PackLen(DWORD v)96 	int PackLen ( DWORD v )
97 	{
98 		if ( v<=251 )
99 			return 1;
100 		else if ( v<65536 )
101 			return 3;
102 		else if ( v<16777216 )
103 			return 4;
104 		else
105 			return 5;
106 	}
107 
PackInt(DWORD v)108 	void PackInt ( DWORD v )
109 	{
110 		assert ( v<16777216 ); // strings over 16M bytes and arrays over 16M entries are not supported
111 		if ( v<252 )
112 		{
113 			m_dBuffer.Add ( BYTE(v) );
114 		} else if ( v<65536 )
115 		{
116 			m_dBuffer.Add ( 252 );
117 			m_dBuffer.Add ( BYTE ( v & 255 ) );
118 			m_dBuffer.Add ( BYTE ( v>>8 ) );
119 		} else
120 		{
121 			m_dBuffer.Add ( 253 );
122 			m_dBuffer.Add ( BYTE ( v & 255 ) );
123 			m_dBuffer.Add ( BYTE ( ( v>>8 ) & 255 ) );
124 			m_dBuffer.Add ( BYTE ( v>>16 ) );
125 		}
126 	}
127 
PackStr(const char * s,int iLen)128 	void PackStr ( const char * s, int iLen )
129 	{
130 		iLen = Min ( iLen, 0xffffff );
131 		PackInt ( iLen );
132 		if ( iLen )
133 		{
134 			BYTE * p = BufAlloc ( iLen );
135 			memcpy ( p, s, iLen );
136 		}
137 	}
138 
JsonUnescape(char ** pEscaped,int iLen)139 	int JsonUnescape ( char ** pEscaped, int iLen )
140 	{
141 		assert ( pEscaped );
142 		char * s = *pEscaped;
143 
144 		// skip heading and trailing quotes
145 		if ( ( s[0]=='\'' && s[iLen-1]=='\'' ) || ( s[0]=='"' && s[iLen-1]=='"' ) )
146 		{
147 			s++;
148 			iLen -= 2;
149 		}
150 
151 		char * sMax = s+iLen;
152 		char * d = s;
153 		char * pStart = d;
154 		char sBuf[8] = { 0 };
155 
156 		while ( s<sMax )
157 		{
158 			if ( s[0]=='\\' )
159 			{
160 				switch ( s[1] )
161 				{
162 				case 'b': *d++ = '\b'; break;
163 				case 'n': *d++ = '\n'; break;
164 				case 'r': *d++ = '\r'; break;
165 				case 't': *d++ = '\t'; break;
166 				case 'f': *d++ = '\f'; break; // formfeed (rfc 4627)
167 				case 'u':
168 					// convert 6-byte sequences \u four-hex-digits (rfc 4627) to UTF-8
169 					if ( s+6<=sMax && isxdigit ( s[2] ) && isxdigit ( s[3] ) && isxdigit ( s[4] ) && isxdigit ( s[5] ) )
170 					{
171 						memcpy ( sBuf, s+2, 4 );
172 						d += sphUTF8Encode ( (BYTE*)d, (int)strtol ( sBuf, NULL, 16 ) );
173 						s += 4;
174 					} else
175 						*d++ = s[1];
176 					break;
177 				default:
178 					*d++ = s[1];
179 				}
180 				s += 2;
181 			} else
182 				*d++ = *s++;
183 		}
184 
185 		*pEscaped = pStart;
186 		return d - pStart;
187 	}
188 
PackNodeStr(const JsonNode_t & tNode)189 	void PackNodeStr ( const JsonNode_t & tNode )
190 	{
191 		int iLen = tNode.m_iEnd-tNode.m_iStart;
192 		char *s = m_pBuf + tNode.m_iStart;
193 		iLen = JsonUnescape ( &s, iLen );
194 		PackStr ( s, iLen );
195 	}
196 
KeyUnescape(char ** ppKey,int iLen)197 	int KeyUnescape ( char ** ppKey, int iLen )
198 	{
199 		char * s = *ppKey;
200 		iLen = JsonUnescape ( &s, iLen );
201 		if ( m_bToLowercase )
202 			for ( int i=0; i<iLen; i++ )
203 				s[i] = (char)tolower ( s[i] ); // OPTIMIZE! not sure if significant, but known to be hell slow
204 		*ppKey = s;
205 		return iLen;
206 	}
207 
StoreMask(int iOfs,DWORD uMask)208 	void StoreMask ( int iOfs, DWORD uMask )
209 	{
210 		for ( int i=0; i<4; i++ )
211 		{
212 			m_dBuffer[iOfs+i] = BYTE ( uMask & 0xff );
213 			uMask >>= 8;
214 		}
215 	}
216 
217 	/// reserve a single byte for a yet-unknown length, to be written later with PackSize()
218 	/// returns its offset, to be used by PackSize() to both calculate and stored the length
ReserveSize()219 	int ReserveSize()
220 	{
221 		int iOfs = m_dBuffer.GetLength();
222 		m_dBuffer.Resize ( iOfs+1 );
223 		return iOfs;
224 	}
225 
226 	/// compute current length from the offset reserved with ReserveSize(), and pack the value back there
227 	/// in most cases that single byte is enough; if not, we make room by memmove()ing the data
PackSize(int iOfs)228 	void PackSize ( int iOfs )
229 	{
230 		int iSize = m_dBuffer.GetLength()-iOfs-1;
231 		int iPackLen = PackLen ( iSize );
232 
233 		if ( iPackLen!=1 )
234 		{
235 			m_dBuffer.Resize ( iOfs+iPackLen+iSize );
236 			memmove ( m_dBuffer.Begin()+iOfs+iPackLen, m_dBuffer.Begin()+iOfs+1, iSize );
237 		}
238 
239 		m_dBuffer.Resize ( iOfs );
240 		PackInt ( iSize );
241 		m_dBuffer.Resize ( iOfs+iPackLen+iSize );
242 	}
243 
244 public:
Finalize()245 	void Finalize()
246 	{
247 		m_dBuffer.Add ( JSON_EOF );
248 	}
249 
NumericFixup(JsonNode_t & tNode)250 	void NumericFixup ( JsonNode_t & tNode )
251 	{
252 		// auto-convert string values, if necessary
253 		if ( tNode.m_eType==JSON_STRING && m_bAutoconv )
254 			if ( !sphJsonStringToNumber ( m_pBuf+tNode.m_iStart+1, tNode.m_iEnd-tNode.m_iStart-2, tNode.m_eType, tNode.m_iValue, tNode.m_fValue ) )
255 				return;
256 
257 		// parser and converter emits int64 values, fix them up to int32
258 		if ( tNode.m_eType==JSON_INT64 )
259 		{
260 			int iVal = int(tNode.m_iValue);
261 			if ( tNode.m_iValue==int64_t(iVal) )
262 				tNode.m_eType = JSON_INT32;
263 		}
264 	}
265 
WriteNode(JsonNode_t & tNode,const char * sKey=NULL,int iKeyLen=0)266 	bool WriteNode ( JsonNode_t & tNode, const char * sKey=NULL, int iKeyLen=0 )
267 	{
268 		// convert int64 to int32, strings to numbers if needed
269 		NumericFixup ( tNode );
270 
271 		ESphJsonType eType = tNode.m_eType;
272 
273 		// note m_iHandle may be uninitialized on simple nodes
274 		CSphVector<JsonNode_t> & dNodes = ( ( eType==JSON_MIXED_VECTOR || eType==JSON_OBJECT ) && tNode.m_iHandle>=0 )
275 			? m_dNodes[ tNode.m_iHandle ]
276 			: m_dEmpty;
277 
278 		// process mixed vector, convert to generic vector if possible
279 		if ( eType==JSON_MIXED_VECTOR )
280 		{
281 			ARRAY_FOREACH ( i, dNodes )
282 				NumericFixup ( dNodes[i] );
283 
284 			ESphJsonType eBase = dNodes.GetLength()>0 ? dNodes[0].m_eType : JSON_EOF;
285 			bool bGeneric = ARRAY_ALL ( bGeneric, dNodes, dNodes[_all].m_eType==eBase );
286 
287 			if ( bGeneric )
288 				switch ( eBase )
289 			{
290 				case JSON_INT32:	eType = JSON_INT32_VECTOR; break;
291 				case JSON_INT64:	eType = JSON_INT64_VECTOR; break;
292 				case JSON_DOUBLE:	eType = JSON_DOUBLE_VECTOR; break;
293 				case JSON_STRING:	eType = JSON_STRING_VECTOR; break;
294 				default:			break; // type matches across all entries, but we do not have a special format for that type
295 			}
296 		}
297 
298 		// check for the root (bson v1), note sKey shouldn't be set
299 		if ( eType==JSON_OBJECT && m_dBuffer.GetLength()==4 && !sKey )
300 			eType = JSON_ROOT;
301 
302 		// write node type
303 		if ( eType!=JSON_ROOT )
304 			m_dBuffer.Add ( (BYTE)eType );
305 
306 		// write key if given
307 		if ( sKey )
308 			PackStr ( sKey, iKeyLen );
309 
310 		switch ( eType )
311 		{
312 		// basic types
313 		case JSON_INT32:	StoreInt ( (int)tNode.m_iValue ); break;
314 		case JSON_INT64:	StoreBigint ( tNode.m_iValue ); break;
315 		case JSON_DOUBLE:	StoreBigint ( sphD2QW ( tNode.m_fValue ) ); break;
316 		case JSON_STRING:	PackNodeStr ( tNode ); break;
317 
318 		// literals
319 		case JSON_TRUE:
320 		case JSON_FALSE:
321 		case JSON_NULL:
322 			// no content
323 			break;
324 
325 		// associative arrays
326 		case JSON_ROOT:
327 		case JSON_OBJECT:
328 			{
329 				DWORD uMask = 0;
330 				int iOfs = 0;
331 
332 				if ( eType==JSON_OBJECT )
333 				{
334 					iOfs = ReserveSize();
335 					StoreInt ( uMask );
336 				}
337 
338 				ARRAY_FOREACH ( i, dNodes )
339 				{
340 					char * sObjKey = m_pBuf + dNodes[i].m_iKeyStart;
341 					int iLen = KeyUnescape ( &sObjKey, dNodes[i].m_iKeyEnd-dNodes[i].m_iKeyStart );
342 					WriteNode ( dNodes[i], sObjKey, iLen );
343 					uMask |= sphJsonKeyMask ( sObjKey, iLen );
344 				}
345 				m_dBuffer.Add ( JSON_EOF );
346 
347 				if ( eType==JSON_OBJECT )
348 				{
349 					StoreMask ( iOfs+1, uMask );
350 					PackSize ( iOfs ); // MUST be in this order, because PackSize() might move the data!
351 				} else
352 				{
353 					assert ( eType==JSON_ROOT );
354 					StoreMask ( 0, uMask );
355 				}
356 				break;
357 			}
358 
359 		// mixed array
360 		case JSON_MIXED_VECTOR:
361 			{
362 				int iOfs = ReserveSize();
363 				PackInt ( dNodes.GetLength() );
364 				ARRAY_FOREACH ( i, dNodes )
365 					WriteNode ( dNodes[i] );
366 				PackSize ( iOfs );
367 				break;
368 			}
369 
370 		// optimized (generic) arrays
371 		case JSON_INT32_VECTOR:
372 			PackInt ( dNodes.GetLength() );
373 			ARRAY_FOREACH ( i, dNodes )
374 				StoreInt ( (int)dNodes[i].m_iValue );
375 			break;
376 		case JSON_INT64_VECTOR:
377 			PackInt ( dNodes.GetLength() );
378 			ARRAY_FOREACH ( i, dNodes )
379 				StoreBigint ( dNodes[i].m_iValue );
380 			break;
381 		case JSON_DOUBLE_VECTOR:
382 			PackInt ( dNodes.GetLength() );
383 			ARRAY_FOREACH ( i, dNodes )
384 				StoreBigint ( sphD2QW ( dNodes[i].m_fValue ) );
385 			break;
386 		case JSON_STRING_VECTOR:
387 			{
388 				int iOfs = ReserveSize();
389 				PackInt ( dNodes.GetLength() );
390 				ARRAY_FOREACH ( i, dNodes )
391 					PackNodeStr ( dNodes[i] );
392 				PackSize ( iOfs );
393 				break;
394 			}
395 		default:
396 			assert ( 0 && "internal error: unhandled type" );
397 			return false;
398 		}
399 		return true;
400 	}
401 
DebugIndent(int iLevel)402 	void DebugIndent ( int iLevel )
403 	{
404 		for ( int i=0; i<iLevel; i++ )
405 			printf ( "    " );
406 	}
407 
DebugDump(ESphJsonType eType,const BYTE ** ppData,int iLevel)408 	void DebugDump ( ESphJsonType eType, const BYTE ** ppData, int iLevel )
409 	{
410 		DebugIndent ( iLevel );
411 
412 		const BYTE * p = *ppData;
413 
414 		switch ( eType )
415 		{
416 		case JSON_INT32: printf ( "JSON_INT32 %d\n", sphJsonLoadInt ( &p ) ); break;
417 		case JSON_INT64: printf ( "JSON_INT64 " INT64_FMT "\n", sphJsonLoadBigint ( &p ) ); break;
418 		case JSON_DOUBLE: printf ( "JSON_DOUBLE %lf\n", sphQW2D ( sphJsonLoadBigint ( &p ) ) ); break;
419 		case JSON_STRING:
420 			{
421 				int iLen = sphJsonUnpackInt ( &p );
422 				CSphString sVal;
423 				sVal.SetBinary ( (const char*)p, iLen );
424 				printf ( "JSON_STRING \"%s\"\n", sVal.cstr() );
425 				p += iLen;
426 				break;
427 			}
428 
429 		case JSON_TRUE:		printf ( "JSON_TRUE\n" ); break;
430 		case JSON_FALSE:	printf ( "JSON_FALSE\n" ); break;
431 		case JSON_NULL:		printf ( "JSON_NULL\n" ); break;
432 		case JSON_EOF:		printf ( "JSON_EOF\n" ); break;
433 
434 		// associative arrays
435 		case JSON_ROOT:
436 		case JSON_OBJECT:
437 			{
438 				if ( eType==JSON_OBJECT )
439 					sphJsonUnpackInt ( &p );
440 
441 				DWORD uMask = sphGetDword(p);
442 				printf ( "%s (bloom mask: 0x%08x)\n", eType==JSON_OBJECT ? "JSON_OBJECT" : "JSON_ROOT", uMask );
443 				p += 4; // skip bloom table
444 				for ( ;; )
445 				{
446 					ESphJsonType eInnerType = (ESphJsonType) *p++;
447 					if ( eInnerType==JSON_EOF )
448 						break;
449 					const int iStrLen = sphJsonUnpackInt ( &p );
450 					CSphString sVal;
451 					sVal.SetBinary ( (const char*)p, iStrLen );
452 					DebugIndent ( iLevel+1 );
453 					printf ( "\"%s\"", sVal.cstr() );
454 					p += iStrLen;
455 					DebugDump ( eInnerType, &p, iLevel+1 );
456 				}
457 				break;
458 			}
459 
460 		case JSON_MIXED_VECTOR:
461 			{
462 				int iTotalLen = sphJsonUnpackInt ( &p );
463 				int iLen = sphJsonUnpackInt ( &p );
464 				printf ( "JSON_MIXED_VECTOR [%d] (%d bytes)\n", iLen, iTotalLen );
465 				for ( int i=0; i<iLen; i++ )
466 				{
467 					ESphJsonType eInnerType = (ESphJsonType)*p++;
468 					DebugDump ( eInnerType, &p, iLevel+1 );
469 				}
470 				break;
471 			}
472 
473 		// optimized arrays ( note they can't be empty )
474 		case JSON_STRING_VECTOR:
475 			{
476 				sphJsonUnpackInt ( &p );
477 				int iLen = sphJsonUnpackInt ( &p );
478 				printf ( "JSON_STRING_VECTOR (%d) [", iLen );
479 				for ( int i=0; i<iLen; i++ )
480 				{
481 					int iStrLen = sphJsonUnpackInt ( &p );
482 					CSphString sVal;
483 					sVal.SetBinary ( (const char*)p, iStrLen );
484 					printf ( "\"%s\"%s", sVal.cstr(), i<iLen-1 ? "," : "]\n" );
485 					p += iStrLen;
486 				}
487 			break;
488 			}
489 		case JSON_INT32_VECTOR:
490 			{
491 				int iLen = sphJsonUnpackInt ( &p );
492 				printf ( "JSON_INT32_VECTOR (%d) [", iLen );
493 				for ( int i=0; i<iLen; i++ )
494 					printf ( "%d%s", sphJsonLoadInt ( &p ), i<iLen-1 ? "," : "]\n" );
495 				break;
496 			}
497 		case JSON_INT64_VECTOR:
498 			{
499 				int iLen = sphJsonUnpackInt ( &p );
500 				printf ( "JSON_INT64_VECTOR (%d) [", iLen );
501 				for ( int i=0; i<iLen; i++ )
502 					printf ( INT64_FMT"%s", sphJsonLoadBigint ( &p ), i<iLen-1 ? "," : "]\n" );
503 				break;
504 			}
505 		case JSON_DOUBLE_VECTOR:
506 			{
507 				int iLen = sphJsonUnpackInt ( &p );
508 				printf ( "JSON_DOUBLE_VECTOR (%d) [", iLen );
509 				for ( int i=0; i<iLen; i++ )
510 					printf ( "%lf%s", sphQW2D ( sphJsonLoadBigint ( &p ) ), i<iLen-1 ? "," : "]\n" );
511 				break;
512 			}
513 
514 		default:
515 			printf ( "UNKNOWN\n" );
516 			break;
517 		}
518 		*ppData = p;
519 	}
520 
DebugDump(const BYTE * p)521 	void DebugDump ( const BYTE * p )
522 	{
523 		CSphVector<BYTE> dOut;
524 		sphJsonFormat ( dOut, m_dBuffer.Begin() );
525 		dOut.Add ( '\0' );
526 		printf ( "sphJsonFormat: %s\n", (char*)dOut.Begin() );
527 
528 		printf ( "Blob size: %d bytes\n", m_dBuffer.GetLength() );
529 		ESphJsonType eType = sphJsonFindFirst ( &p );
530 		DebugDump ( eType, &p, 0 );
531 		printf ( "\n" );
532 	}
533 };
534 
535 // unused parameter, simply to avoid type clash between all my yylex() functions
536 #define YY_NO_UNISTD_H 1
537 #define YY_DECL static int my_lex ( YYSTYPE * lvalp, void * yyscanner, JsonParser_c * pParser )
538 
539 #ifdef CMAKE_GENERATED_LEXER
540 	#include "flexsphinxjson.c"
541 #else
542 	#include "llsphinxjson.c"
543 #endif
544 
yyerror(JsonParser_c * pParser,const char * sMessage)545 void yyerror ( JsonParser_c * pParser, const char * sMessage )
546 {
547 	yy2lex_unhold ( pParser->m_pScanner );
548 	pParser->m_sError.SetSprintf ( "%s near '%s'", sMessage, pParser->m_pLastToken );
549 }
550 
yylex(YYSTYPE * lvalp,JsonParser_c * pParser)551 static int yylex ( YYSTYPE * lvalp, JsonParser_c * pParser )
552 {
553 	return my_lex ( lvalp, pParser->m_pScanner, pParser );
554 }
555 
556 #ifdef CMAKE_GENERATED_GRAMMAR
557 	#include "bissphinxjson.c"
558 #else
559 	#include "yysphinxjson.c"
560 #endif
561 
sphJsonParse(CSphVector<BYTE> & dData,char * sData,bool bAutoconv,bool bToLowercase,CSphString & sError)562 bool sphJsonParse ( CSphVector<BYTE> & dData, char * sData, bool bAutoconv, bool bToLowercase, CSphString & sError )
563 {
564 	int iLen = strlen ( sData );
565 	if ( sData[iLen+1]!=0 )
566 	{
567 		sError = "internal error: input data passed to sphJsonParse() must be terminated with a double zero";
568 		return false;
569 	}
570 
571 	JsonParser_c tParser ( dData, bAutoconv, bToLowercase, sError );
572 	yy2lex_init ( &tParser.m_pScanner );
573 
574 	tParser.m_pBuf = sData; // sphJsonParse() is intentionally destructive, no need to copy data here
575 
576 	YY_BUFFER_STATE tLexerBuffer = yy2_scan_buffer ( sData, iLen+2, tParser.m_pScanner );
577 	if ( !tLexerBuffer )
578 	{
579 		sError = "internal error: yy_scan_buffer() failed";
580 		return false;
581 	}
582 
583 	int iRes = yyparse ( &tParser );
584 	yy2_delete_buffer ( tLexerBuffer, tParser.m_pScanner );
585 	yy2lex_destroy ( tParser.m_pScanner );
586 
587 	tParser.Finalize();
588 
589 	if ( dData.GetSizeBytes() >= 0x400000 )
590 	{
591 		sError = "data exceeds 0x400000 bytes";
592 		iRes = -1;
593 	}
594 
595 	if ( iRes!=0 )
596 		dData.Reset();
597 
598 	return iRes==0;
599 }
600 
601 //////////////////////////////////////////////////////////////////////////
602 
sphJsonKeyMask(const char * sKey,int iLen)603 DWORD sphJsonKeyMask ( const char * sKey, int iLen )
604 {
605 	DWORD uCrc = sphCRC32 ( sKey, iLen );
606 	return
607 		( 1UL<<( uCrc & 31 ) ) +
608 		( 1UL<<( ( uCrc>>8 ) & 31 ) );
609 }
610 
611 
612 // returns -1 if size is unreachable (for remote agents)
sphJsonNodeSize(ESphJsonType eType,const BYTE * pData)613 int sphJsonNodeSize ( ESphJsonType eType, const BYTE *pData )
614 {
615 	int iLen;
616 	const BYTE * p = pData;
617 	switch ( eType )
618 	{
619 	case JSON_INT32:
620 		return 4;
621 	case JSON_INT64:
622 	case JSON_DOUBLE:
623 		return 8;
624 	case JSON_INT32_VECTOR:
625 		if ( !p )
626 			return -1;
627 		iLen = sphJsonUnpackInt ( &p );
628 		return p - pData + iLen * 4;
629 	case JSON_INT64_VECTOR:
630 	case JSON_DOUBLE_VECTOR:
631 		if ( !p )
632 			return -1;
633 		iLen = sphJsonUnpackInt ( &p );
634 		return p - pData + iLen * 8;
635 	case JSON_STRING:
636 	case JSON_STRING_VECTOR:
637 	case JSON_MIXED_VECTOR:
638 	case JSON_OBJECT:
639 		if ( !p )
640 			return -1;
641 		iLen = sphJsonUnpackInt ( &p );
642 		return p - pData + iLen;
643 	case JSON_ROOT:
644 		if ( !p )
645 			return -1;
646 		p += 4; // skip filter
647 		for ( ;; )
648 		{
649 			ESphJsonType eNode = (ESphJsonType) *p++;
650 			if ( eNode==JSON_EOF )
651 				break;
652 			// skip key and node
653 			iLen = sphJsonUnpackInt ( &p );
654 			p += iLen;
655 			sphJsonSkipNode ( eNode, &p );
656 		}
657 		return p - pData;
658 	default:
659 		return 0;
660 	}
661 }
662 
663 
sphJsonSkipNode(ESphJsonType eType,const BYTE ** ppData)664 void sphJsonSkipNode ( ESphJsonType eType, const BYTE ** ppData )
665 {
666 	int iSize = sphJsonNodeSize ( eType, *ppData );
667 	*ppData += iSize;
668 }
669 
670 
sphJsonFieldLength(ESphJsonType eType,const BYTE * pData)671 int sphJsonFieldLength ( ESphJsonType eType, const BYTE * pData )
672 {
673 	const BYTE * p = pData;
674 	int iCount = 0;
675 	switch ( eType )
676 	{
677 	case JSON_INT32:
678 	case JSON_INT64:
679 	case JSON_DOUBLE:
680 		return 1;
681 	case JSON_STRING_VECTOR:
682 	case JSON_MIXED_VECTOR:
683 		sphJsonUnpackInt ( &p );
684 		return sphJsonUnpackInt ( &p );
685 	case JSON_INT32_VECTOR:
686 	case JSON_INT64_VECTOR:
687 	case JSON_DOUBLE_VECTOR:
688 		return sphJsonUnpackInt ( &p );
689 	case JSON_OBJECT:
690 	case JSON_ROOT:
691 		if ( eType==JSON_OBJECT )
692 			sphJsonUnpackInt ( &p ); // skip size
693 		p += 4; // skip filter
694 		for ( ;; )
695 		{
696 			ESphJsonType eNode = (ESphJsonType) *p++;
697 			if ( eNode==JSON_EOF )
698 				break;
699 			int iLen = sphJsonUnpackInt ( &p );
700 			p += iLen;
701 			sphJsonSkipNode ( eNode, &p );
702 			iCount++;
703 		}
704 		return iCount;
705 	default:
706 		return 0;
707 	}
708 }
709 
710 
sphJsonFindFirst(const BYTE ** ppData)711 ESphJsonType sphJsonFindFirst ( const BYTE ** ppData )
712 {
713 	// non-zero bloom mask? that is JSON_ROOT (basically a JSON_OBJECT without node header)
714 	if ( sphGetDword(*ppData) )
715 		return JSON_ROOT;
716 
717 	// zero mask? must be followed by the type byte (typically JSON_EOF)
718 	ESphJsonType eType = (ESphJsonType)((*ppData)[4]);
719 	*ppData += 5;
720 	return eType;
721 }
722 
723 
sphJsonFindByKey(ESphJsonType eType,const BYTE ** ppValue,const void * pKey,int iLen,DWORD uMask)724 ESphJsonType sphJsonFindByKey ( ESphJsonType eType, const BYTE ** ppValue, const void * pKey, int iLen, DWORD uMask )
725 {
726 	if ( eType!=JSON_OBJECT && eType!=JSON_ROOT )
727 		return JSON_EOF;
728 
729 	const BYTE * p = *ppValue;
730 	if ( eType==JSON_OBJECT )
731 		sphJsonUnpackInt ( &p );
732 
733 	if ( ( sphGetDword(p) & uMask )!=uMask )
734 		return JSON_EOF;
735 
736 	p += 4;
737 	for ( ;; )
738 	{
739 		eType = (ESphJsonType) *p++;
740 		if ( eType==JSON_EOF )
741 			break;
742 		int iStrLen = sphJsonUnpackInt ( &p );
743 		p += iStrLen;
744 		if ( iStrLen==iLen && !memcmp ( p-iStrLen, pKey, iStrLen ) )
745 		{
746 			*ppValue = p;
747 			return eType;
748 		}
749 		sphJsonSkipNode ( eType, &p );
750 	}
751 
752 	return JSON_EOF;
753 }
754 
755 
sphJsonFindByIndex(ESphJsonType eType,const BYTE ** ppValue,int iIndex)756 ESphJsonType sphJsonFindByIndex ( ESphJsonType eType, const BYTE ** ppValue, int iIndex )
757 {
758 	if ( iIndex<0 )
759 		return JSON_EOF;
760 
761 	const BYTE * p = *ppValue;
762 	switch ( eType )
763 	{
764 	case JSON_INT32_VECTOR:
765 	case JSON_INT64_VECTOR:
766 	case JSON_DOUBLE_VECTOR:
767 		{
768 			int iLen = sphJsonUnpackInt ( &p );
769 			if ( iIndex>=iLen )
770 				return JSON_EOF;
771 			p += iIndex * ( eType==JSON_INT32_VECTOR ? 4 : 8 );
772 			*ppValue = p;
773 			return eType==JSON_INT32_VECTOR ? JSON_INT32
774 				: eType==JSON_INT64_VECTOR ? JSON_INT64
775 				: JSON_DOUBLE;
776 		}
777 	case JSON_STRING_VECTOR:
778 		{
779 			sphJsonUnpackInt ( &p );
780 			int iLen = sphJsonUnpackInt ( &p );
781 			if ( iIndex>=iLen )
782 				return JSON_EOF;
783 			for ( int i=0; i<iIndex; i++ )
784 			{
785 				int iStrLen = sphJsonUnpackInt ( &p );
786 				p += iStrLen;
787 			}
788 			*ppValue = p;
789 			return JSON_STRING;
790 		}
791 	case JSON_MIXED_VECTOR:
792 		{
793 			sphJsonUnpackInt ( &p );
794 			int iLen = sphJsonUnpackInt ( &p );
795 			if ( iIndex>=iLen )
796 				return JSON_EOF;
797 			for ( int i=0; i<iIndex; i++ )
798 			{
799 				eType = (ESphJsonType)*p++;
800 				sphJsonSkipNode ( eType, &p );
801 			}
802 			eType = (ESphJsonType)*p;
803 			*ppValue = p+1;
804 			return eType;
805 		}
806 	default:
807 		return JSON_EOF;
808 		break;
809 	}
810 }
811 
812 //////////////////////////////////////////////////////////////////////////
813 
JsonFormatStr(CSphVector<BYTE> & dOut,const BYTE * p,bool bQuote=true)814 static const BYTE * JsonFormatStr ( CSphVector<BYTE> & dOut, const BYTE * p, bool bQuote=true )
815 {
816 	int iLen = sphJsonUnpackInt ( &p );
817 	dOut.Reserve ( dOut.GetLength()+iLen );
818 	if ( bQuote )
819 		dOut.Add ( '"' );
820 	while ( iLen-- )
821 	{
822 		if ( bQuote )
823 		{
824 			switch ( *p )
825 			{
826 				case '\b': dOut.Add('\\'); dOut.Add('b'); break;
827 				case '\n': dOut.Add('\\'); dOut.Add('n'); break;
828 				case '\r': dOut.Add('\\'); dOut.Add('r'); break;
829 				case '\t': dOut.Add('\\'); dOut.Add('t'); break;
830 				case '\f': dOut.Add('\\'); dOut.Add('f'); break; // formfeed (rfc 4627)
831 				default:
832 					if ( *p == '"' || *p=='\\' || *p=='/' )
833 						dOut.Add ( '\\' );
834 					dOut.Add ( *p );
835 			}
836 		} else
837 			dOut.Add ( *p );
838 		p++;
839 	}
840 	if ( bQuote )
841 		dOut.Add ( '"' );
842 	return p;
843 }
844 
845 
JsonAddStr(CSphVector<BYTE> & dOut,const char * pStr)846 void JsonAddStr ( CSphVector<BYTE> & dOut, const char * pStr )
847 {
848 	while ( *pStr )
849 		dOut.Add ( *pStr++ );
850 }
851 
852 
sphJsonFormat(CSphVector<BYTE> & dOut,const BYTE * pData)853 void sphJsonFormat ( CSphVector<BYTE> & dOut, const BYTE * pData )
854 {
855 	if ( !pData )
856 		return;
857 	ESphJsonType eType = sphJsonFindFirst ( &pData );
858 
859 	// check for the empty root
860 	if ( eType==JSON_EOF )
861 	{
862 		JsonAddStr ( dOut, "{}" );
863 		return;
864 	}
865 
866 	sphJsonFieldFormat ( dOut, pData, eType );
867 }
868 
869 
sphJsonFieldFormat(CSphVector<BYTE> & dOut,const BYTE * pData,ESphJsonType eType,bool bQuoteString)870 const BYTE * sphJsonFieldFormat ( CSphVector<BYTE> & dOut, const BYTE * pData, ESphJsonType eType, bool bQuoteString )
871 {
872 	const BYTE * p = pData;
873 
874 	// format value
875 	switch ( eType )
876 	{
877 		case JSON_INT32:
878 		{
879 			int iOff = dOut.GetLength();
880 			dOut.Resize ( iOff+32 );
881 			int iLen = snprintf ( (char *)dOut.Begin()+iOff, 32, "%d", sphJsonLoadInt ( &p ) ); // NOLINT
882 			dOut.Resize ( iOff+iLen );
883 			break;
884 		}
885 		case JSON_INT64:
886 		{
887 			int iOff = dOut.GetLength();
888 			dOut.Resize ( iOff+32 );
889 			int iLen = snprintf ( (char *)dOut.Begin()+iOff, 32, INT64_FMT, sphJsonLoadBigint ( &p ) ); // NOLINT
890 			dOut.Resize ( iOff+iLen );
891 			break;
892 		}
893 		case JSON_DOUBLE:
894 		{
895 			int iOff = dOut.GetLength();
896 			dOut.Resize ( iOff+32 );
897 			int iLen = snprintf ( (char *)dOut.Begin()+iOff, 32, "%lf", sphQW2D ( sphJsonLoadBigint ( &p ) ) ); // NOLINT
898 			dOut.Resize ( iOff+iLen );
899 			break;
900 		}
901 		case JSON_STRING:
902 			p = JsonFormatStr ( dOut, p, bQuoteString );
903 			break;
904 		case JSON_STRING_VECTOR:
905 		{
906 			int iLen = sphJsonUnpackInt ( &p );
907 			dOut.Reserve ( dOut.GetLength()+iLen );
908 			int iVals = sphJsonUnpackInt ( &p );
909 			dOut.Add ( '[' );
910 			for ( int i=0; i<iVals; i++ )
911 			{
912 				if ( i>0 )
913 					dOut.Add ( ',' );
914 				p = JsonFormatStr ( dOut, p );
915 			}
916 			dOut.Add ( ']' );
917 			break;
918 		}
919 		case JSON_INT32_VECTOR:
920 		case JSON_INT64_VECTOR:
921 		case JSON_DOUBLE_VECTOR:
922 		{
923 			int iVals = sphJsonUnpackInt ( &p );
924 			dOut.Add ( '[' );
925 			for ( int i=0; i<iVals; i++ )
926 			{
927 				if ( i>0 )
928 					dOut.Add ( ',' );
929 				int iOff = dOut.GetLength();
930 				dOut.Resize ( iOff+32 );
931 				int iLen = 0;
932 				char * b = (char *)dOut.Begin()+iOff;
933 				switch ( eType )
934 				{
935 				case JSON_INT32_VECTOR: iLen = snprintf ( b, 32, "%d", sphJsonLoadInt ( &p ) ); break; // NOLINT
936 				case JSON_INT64_VECTOR: iLen = snprintf ( b, 32, INT64_FMT, sphJsonLoadBigint ( &p ) ); break; // NOLINT
937 				case JSON_DOUBLE_VECTOR: iLen = snprintf ( b, 32, "%lf", sphQW2D ( sphJsonLoadBigint ( &p ) ) ); break; // NOLINT
938 				default:
939 					break;
940 				}
941 				dOut.Resize ( iOff+iLen );
942 			}
943 			dOut.Add ( ']' );
944 			break;
945 		}
946 		case JSON_MIXED_VECTOR:
947 			{
948 				sphJsonUnpackInt ( &p );
949 				int iVals = sphJsonUnpackInt ( &p );
950 				dOut.Add ( '[' );
951 				for ( int i=0; i<iVals; i++ )
952 				{
953 					if ( i>0 )
954 						dOut.Add ( ',' );
955 					ESphJsonType eNode = (ESphJsonType) *p++;
956 					p = sphJsonFieldFormat ( dOut, p, eNode, true );
957 				}
958 				dOut.Add ( ']' );
959 				break;
960 			}
961 		case JSON_ROOT:
962 		case JSON_OBJECT:
963 			{
964 				if ( eType==JSON_OBJECT )
965 					sphJsonUnpackInt ( &p );
966 				p += 4; // skip bloom table
967 				dOut.Add ( '{' );
968 				for ( int i=0;;i++ )
969 				{
970 					ESphJsonType eNode = (ESphJsonType) *p++;
971 					if ( eNode==JSON_EOF )
972 						break;
973 					if ( i>0 )
974 						dOut.Add ( ',' );
975 					p = JsonFormatStr ( dOut, p );
976 					dOut.Add ( ':' );
977 					p = sphJsonFieldFormat ( dOut, p, eNode, true );
978 				}
979 				dOut.Add ( '}' );
980 				break;
981 			}
982 		case JSON_TRUE:		JsonAddStr ( dOut, bQuoteString ? "true" : "1" ); break;
983 		case JSON_FALSE:	JsonAddStr ( dOut, bQuoteString ? "false" : "0" ); break;
984 		case JSON_NULL:		JsonAddStr ( dOut, bQuoteString ? "null" : "" ); break;
985 		case JSON_EOF:		break;
986 		case JSON_TOTAL:	break;
987 	}
988 
989 	return p;
990 }
991 
992 
sphJsonNameSplit(const char * sName,CSphString * sColumn,CSphString * sKey)993 bool sphJsonNameSplit ( const char * sName, CSphString * sColumn, CSphString * sKey )
994 {
995 	if ( !sName )
996 		return false;
997 
998 	// find either '[' or '.', what comes first
999 	const char * pSep = sName;
1000 	while ( *pSep && *pSep!='.' && *pSep!='[' )
1001 	{
1002 		// check for invalid characters
1003 		if ( !sphIsAttr( *pSep ) && *pSep!=' ' )
1004 			return false;
1005 		pSep++;
1006 	}
1007 
1008 	if ( !*pSep )
1009 		return false;
1010 
1011 	int iSep = pSep - sName;
1012 	if ( sColumn )
1013 	{
1014 		sColumn->SetBinary ( sName, iSep );
1015 		sColumn->Trim();
1016 	}
1017 
1018 	if ( sKey )
1019 		*sKey = sName + iSep + ( *pSep=='.' ? 1 : 0 );
1020 
1021 	return true;
1022 }
1023 
1024 
JsonKey_t()1025 JsonKey_t::JsonKey_t ()
1026 	: m_uMask ( 0 )
1027 	, m_iLen ( 0 )
1028 {}
1029 
1030 
JsonKey_t(const char * sKey,int iLen)1031 JsonKey_t::JsonKey_t ( const char * sKey, int iLen )
1032 {
1033 	m_iLen = iLen;
1034 	m_uMask = sphJsonKeyMask ( sKey, m_iLen );
1035 	m_sKey.SetBinary ( sKey, m_iLen );
1036 }
1037 
1038 
JsonStoreInt(BYTE * p,int v)1039 void JsonStoreInt ( BYTE * p, int v )
1040 {
1041 	*p++ = BYTE(DWORD(v));
1042 	*p++ = BYTE(DWORD(v) >> 8);
1043 	*p++ = BYTE(DWORD(v) >> 16);
1044 	*p++ = BYTE(DWORD(v) >> 24);
1045 }
1046 
1047 
JsonStoreBigint(BYTE * p,int64_t v)1048 void JsonStoreBigint ( BYTE * p, int64_t v )
1049 {
1050 	JsonStoreInt ( p, (DWORD)( v & 0xffffffffUL ) );
1051 	JsonStoreInt ( p+4, (int)( v>>32 ) );
1052 }
1053 
1054 
sphJsonInplaceUpdate(ESphJsonType eValueType,int64_t iValue,ISphExpr * pExpr,BYTE * pStrings,const CSphRowitem * pRow,bool bUpdate)1055 bool sphJsonInplaceUpdate ( ESphJsonType eValueType, int64_t iValue, ISphExpr * pExpr, BYTE * pStrings, const CSphRowitem * pRow, bool bUpdate )
1056 {
1057 	if ( !pExpr || !pStrings )
1058 		return false;
1059 
1060 	pExpr->Command ( SPH_EXPR_SET_STRING_POOL, (void*)pStrings );
1061 
1062 	CSphMatch tMatch;
1063 	tMatch.m_pStatic = pRow;
1064 
1065 	uint64_t uPacked = pExpr->Int64Eval ( tMatch );
1066 	BYTE * pData = pStrings + ( uPacked & 0xffffffff );
1067 	ESphJsonType eType = (ESphJsonType)( uPacked >> 32 );
1068 
1069 	switch ( eType )
1070 	{
1071 	case JSON_INT32:
1072 		if ( eValueType==JSON_DOUBLE )
1073 			iValue = (int64_t)sphQW2D ( iValue );
1074 		if ( int64_t(int(iValue))!=iValue )
1075 			return false;
1076 		if ( bUpdate )
1077 			JsonStoreInt ( pData, (int)iValue );
1078 		break;
1079 	case JSON_INT64:
1080 		if ( bUpdate )
1081 			JsonStoreBigint ( pData, eValueType==JSON_DOUBLE ? (int64_t)sphQW2D ( iValue ) : iValue );
1082 		break;
1083 	case JSON_DOUBLE:
1084 		if ( bUpdate )
1085 			JsonStoreBigint ( pData, eValueType==JSON_DOUBLE ? iValue : sphD2QW ( (double)iValue ) );
1086 		break;
1087 	default:
1088 		return false;
1089 	}
1090 	return true;
1091 }
1092 
1093 
sphJsonStringToNumber(const char * s,int iLen,ESphJsonType & eType,int64_t & iVal,double & fVal)1094 bool sphJsonStringToNumber ( const char * s, int iLen, ESphJsonType & eType, int64_t & iVal, double & fVal )
1095 {
1096 	// skip whitespace
1097 	while ( iLen>0 && ( *s==' ' || *s=='\n' || *s=='\r' || *s=='\t' || *s=='\f' ) )
1098 		s++, iLen--;
1099 
1100 	if ( iLen<=0 )
1101 		return false;
1102 
1103 	// check whether the string looks like a numeric
1104 	const char * p = s;
1105 	const char * pEnd = p+iLen-1;
1106 	bool bNumeric = ( *p=='-' || *p=='.' || ( *p>='0' && *p<='9' ) );
1107 	bool bDot = ( *p=='.' );
1108 	bool bExp = false;
1109 	bool bExpSign = false;
1110 	while ( bNumeric && p<pEnd )
1111 	{
1112 		p++;
1113 		switch ( *p )
1114 		{
1115 		case '.':
1116 			if ( bDot )
1117 				bNumeric = false;
1118 			bDot = true;
1119 			break;
1120 		case 'e':
1121 		case 'E':
1122 			if ( bExp )
1123 				bNumeric = false;
1124 			bExp = true;
1125 			break;
1126 		case '-':
1127 		case '+':
1128 			if ( !bExp || bExpSign )
1129 				bNumeric = false;
1130 			bExpSign = true;
1131 			break;
1132 		default:
1133 			if ( *p<'0' || *p >'9' )
1134 				bNumeric = false;
1135 		}
1136 	}
1137 
1138 	// convert string to number
1139 	if ( bNumeric && iLen<32 )
1140 	{
1141 		char sVal[32];
1142 		memcpy ( sVal, s, iLen );
1143 		sVal[iLen] = '\0';
1144 		char * pCur;
1145 
1146 		// setting errno to zero is necessary because strtod/strtoll do not indicate
1147 		// whether it was an overflow or a valid input for borderline values
1148 		errno = 0;
1149 
1150 		if ( bDot || bExp )
1151 		{
1152 			double fRes = strtod ( sVal, &pCur );
1153 			if ( pCur==sVal+iLen && errno!=ERANGE )
1154 			{
1155 				eType = JSON_DOUBLE;
1156 				fVal = fRes;
1157 				return true;
1158 			}
1159 
1160 		} else
1161 		{
1162 			int64_t iRes = strtoll ( sVal, &pCur, 10 );
1163 			if ( pCur==sVal+iLen && errno!=ERANGE )
1164 			{
1165 				eType = JSON_INT64;
1166 				iVal = iRes;
1167 				return true;
1168 			}
1169 		}
1170 	}
1171 
1172 	return false;
1173 }
1174 
1175 //
1176 // $Id$
1177 //
1178