1 //
2 // $Id$
3 //
4
5 //
6 // Copyright (c) 2011-2016, Andrew Aksyonoff
7 // Copyright (c) 2011-2016, Sphinx Technologies Inc
8 // All rights reserved
9 //
10 // This program is free software; you can redistribute it and/or modify
11 // it under the terms of the GNU General Public License. You should have
12 // received a copy of the GPL license along with this program; if you
13 // did not, you can find it at http://www.gnu.org/
14 //
15
16 #include "sphinxjson.h"
17 #include "sphinxint.h"
18
19 #if USE_WINDOWS
20 #include <io.h> // for isatty() in llsphinxjson.c
21 #endif
22
23 //////////////////////////////////////////////////////////////////////////
24
25 /// parser view on a generic node
26 struct JsonNode_t
27 {
28 ESphJsonType m_eType; ///< node type
29 int64_t m_iValue; ///< integer value, only used for JSON_INT32 and JSON_INT64
30 double m_fValue; ///< floating point value, only used for JSON_DOUBLE
31 int m_iStart; ///< string value, start index (inclusive) into m_pBuf, only used for JSON_STRING
32 int m_iEnd; ///< string value, end index (exclusive) into m_pBuf, only used for JSON_STRING
33 int m_iHandle; ///< subobject value, index into m_dNodes storage
34 int m_iKeyStart; ///< node name, start index (inclusive) into m_pBuf
35 int m_iKeyEnd; ///< node name, end index (exclusive) into m_pBuf
36
JsonNode_tJsonNode_t37 JsonNode_t ()
38 : m_eType ( JSON_TOTAL )
39 {}
40 };
41 #define YYSTYPE JsonNode_t
42
43 // must be included after YYSTYPE declaration
44 class JsonParser_c;
45
46 /// actually, JSON-to-SphinxBSON converter helper, but who cares
47 class JsonParser_c : ISphNoncopyable
48 {
49 public:
50 void * m_pScanner;
51 const char * m_pLastToken;
52 CSphVector<BYTE> & m_dBuffer;
53 CSphString & m_sError;
54 bool m_bAutoconv;
55 bool m_bToLowercase;
56 char * m_pBuf;
57 CSphVector < CSphVector<JsonNode_t> > m_dNodes;
58 CSphVector<JsonNode_t> m_dEmpty;
59
60 public:
/// binds the parser to the output blob and the error string it reports into,
/// and starts the blob with a zeroed 4-byte slot for the root Bloom mask
JsonParser_c ( CSphVector<BYTE> & dBuffer, bool bAutoconv, bool bToLowercase, CSphString & sError )
	: m_pScanner ( NULL )
	, m_pLastToken ( NULL )
	, m_dBuffer ( dBuffer )
	, m_sError ( sError )
	, m_bAutoconv ( bAutoconv )
	, m_bToLowercase ( bToLowercase )
	, m_pBuf ( NULL ) // used to be left indeterminate; sphJsonParse() points it at the source text
{
	// reserve 4 bytes for Bloom mask
	StoreInt ( 0 );
}
72
73 protected:
BufAlloc(int iLen)74 BYTE * BufAlloc ( int iLen )
75 {
76 int iPos = m_dBuffer.GetLength();
77 m_dBuffer.Resize ( m_dBuffer.GetLength()+iLen );
78 return m_dBuffer.Begin()+iPos;
79 }
80
StoreInt(int v)81 void StoreInt ( int v )
82 {
83 BYTE * p = BufAlloc ( 4 );
84 *p++ = BYTE(DWORD(v));
85 *p++ = BYTE(DWORD(v) >> 8);
86 *p++ = BYTE(DWORD(v) >> 16);
87 *p++ = BYTE(DWORD(v) >> 24);
88 }
89
StoreBigint(int64_t v)90 void StoreBigint ( int64_t v )
91 {
92 StoreInt ( (DWORD)( v & 0xffffffffUL ) );
93 StoreInt ( (int)( v>>32 ) );
94 }
95
PackLen(DWORD v)96 int PackLen ( DWORD v )
97 {
98 if ( v<=251 )
99 return 1;
100 else if ( v<65536 )
101 return 3;
102 else if ( v<16777216 )
103 return 4;
104 else
105 return 5;
106 }
107
PackInt(DWORD v)108 void PackInt ( DWORD v )
109 {
110 assert ( v<16777216 ); // strings over 16M bytes and arrays over 16M entries are not supported
111 if ( v<252 )
112 {
113 m_dBuffer.Add ( BYTE(v) );
114 } else if ( v<65536 )
115 {
116 m_dBuffer.Add ( 252 );
117 m_dBuffer.Add ( BYTE ( v & 255 ) );
118 m_dBuffer.Add ( BYTE ( v>>8 ) );
119 } else
120 {
121 m_dBuffer.Add ( 253 );
122 m_dBuffer.Add ( BYTE ( v & 255 ) );
123 m_dBuffer.Add ( BYTE ( ( v>>8 ) & 255 ) );
124 m_dBuffer.Add ( BYTE ( v>>16 ) );
125 }
126 }
127
PackStr(const char * s,int iLen)128 void PackStr ( const char * s, int iLen )
129 {
130 iLen = Min ( iLen, 0xffffff );
131 PackInt ( iLen );
132 if ( iLen )
133 {
134 BYTE * p = BufAlloc ( iLen );
135 memcpy ( p, s, iLen );
136 }
137 }
138
JsonUnescape(char ** pEscaped,int iLen)139 int JsonUnescape ( char ** pEscaped, int iLen )
140 {
141 assert ( pEscaped );
142 char * s = *pEscaped;
143
144 // skip heading and trailing quotes
145 if ( ( s[0]=='\'' && s[iLen-1]=='\'' ) || ( s[0]=='"' && s[iLen-1]=='"' ) )
146 {
147 s++;
148 iLen -= 2;
149 }
150
151 char * sMax = s+iLen;
152 char * d = s;
153 char * pStart = d;
154 char sBuf[8] = { 0 };
155
156 while ( s<sMax )
157 {
158 if ( s[0]=='\\' )
159 {
160 switch ( s[1] )
161 {
162 case 'b': *d++ = '\b'; break;
163 case 'n': *d++ = '\n'; break;
164 case 'r': *d++ = '\r'; break;
165 case 't': *d++ = '\t'; break;
166 case 'f': *d++ = '\f'; break; // formfeed (rfc 4627)
167 case 'u':
168 // convert 6-byte sequences \u four-hex-digits (rfc 4627) to UTF-8
169 if ( s+6<=sMax && isxdigit ( s[2] ) && isxdigit ( s[3] ) && isxdigit ( s[4] ) && isxdigit ( s[5] ) )
170 {
171 memcpy ( sBuf, s+2, 4 );
172 d += sphUTF8Encode ( (BYTE*)d, (int)strtol ( sBuf, NULL, 16 ) );
173 s += 4;
174 } else
175 *d++ = s[1];
176 break;
177 default:
178 *d++ = s[1];
179 }
180 s += 2;
181 } else
182 *d++ = *s++;
183 }
184
185 *pEscaped = pStart;
186 return d - pStart;
187 }
188
PackNodeStr(const JsonNode_t & tNode)189 void PackNodeStr ( const JsonNode_t & tNode )
190 {
191 int iLen = tNode.m_iEnd-tNode.m_iStart;
192 char *s = m_pBuf + tNode.m_iStart;
193 iLen = JsonUnescape ( &s, iLen );
194 PackStr ( s, iLen );
195 }
196
KeyUnescape(char ** ppKey,int iLen)197 int KeyUnescape ( char ** ppKey, int iLen )
198 {
199 char * s = *ppKey;
200 iLen = JsonUnescape ( &s, iLen );
201 if ( m_bToLowercase )
202 for ( int i=0; i<iLen; i++ )
203 s[i] = (char)tolower ( s[i] ); // OPTIMIZE! not sure if significant, but known to be hell slow
204 *ppKey = s;
205 return iLen;
206 }
207
StoreMask(int iOfs,DWORD uMask)208 void StoreMask ( int iOfs, DWORD uMask )
209 {
210 for ( int i=0; i<4; i++ )
211 {
212 m_dBuffer[iOfs+i] = BYTE ( uMask & 0xff );
213 uMask >>= 8;
214 }
215 }
216
217 /// reserve a single byte for a yet-unknown length, to be written later with PackSize()
218 /// returns its offset, to be used by PackSize() to both calculate and stored the length
ReserveSize()219 int ReserveSize()
220 {
221 int iOfs = m_dBuffer.GetLength();
222 m_dBuffer.Resize ( iOfs+1 );
223 return iOfs;
224 }
225
226 /// compute current length from the offset reserved with ReserveSize(), and pack the value back there
227 /// in most cases that single byte is enough; if not, we make room by memmove()ing the data
PackSize(int iOfs)228 void PackSize ( int iOfs )
229 {
230 int iSize = m_dBuffer.GetLength()-iOfs-1;
231 int iPackLen = PackLen ( iSize );
232
233 if ( iPackLen!=1 )
234 {
235 m_dBuffer.Resize ( iOfs+iPackLen+iSize );
236 memmove ( m_dBuffer.Begin()+iOfs+iPackLen, m_dBuffer.Begin()+iOfs+1, iSize );
237 }
238
239 m_dBuffer.Resize ( iOfs );
240 PackInt ( iSize );
241 m_dBuffer.Resize ( iOfs+iPackLen+iSize );
242 }
243
244 public:
Finalize()245 void Finalize()
246 {
247 m_dBuffer.Add ( JSON_EOF );
248 }
249
NumericFixup(JsonNode_t & tNode)250 void NumericFixup ( JsonNode_t & tNode )
251 {
252 // auto-convert string values, if necessary
253 if ( tNode.m_eType==JSON_STRING && m_bAutoconv )
254 if ( !sphJsonStringToNumber ( m_pBuf+tNode.m_iStart+1, tNode.m_iEnd-tNode.m_iStart-2, tNode.m_eType, tNode.m_iValue, tNode.m_fValue ) )
255 return;
256
257 // parser and converter emits int64 values, fix them up to int32
258 if ( tNode.m_eType==JSON_INT64 )
259 {
260 int iVal = int(tNode.m_iValue);
261 if ( tNode.m_iValue==int64_t(iVal) )
262 tNode.m_eType = JSON_INT32;
263 }
264 }
265
WriteNode(JsonNode_t & tNode,const char * sKey=NULL,int iKeyLen=0)266 bool WriteNode ( JsonNode_t & tNode, const char * sKey=NULL, int iKeyLen=0 )
267 {
268 // convert int64 to int32, strings to numbers if needed
269 NumericFixup ( tNode );
270
271 ESphJsonType eType = tNode.m_eType;
272
273 // note m_iHandle may be uninitialized on simple nodes
274 CSphVector<JsonNode_t> & dNodes = ( ( eType==JSON_MIXED_VECTOR || eType==JSON_OBJECT ) && tNode.m_iHandle>=0 )
275 ? m_dNodes[ tNode.m_iHandle ]
276 : m_dEmpty;
277
278 // process mixed vector, convert to generic vector if possible
279 if ( eType==JSON_MIXED_VECTOR )
280 {
281 ARRAY_FOREACH ( i, dNodes )
282 NumericFixup ( dNodes[i] );
283
284 ESphJsonType eBase = dNodes.GetLength()>0 ? dNodes[0].m_eType : JSON_EOF;
285 bool bGeneric = ARRAY_ALL ( bGeneric, dNodes, dNodes[_all].m_eType==eBase );
286
287 if ( bGeneric )
288 switch ( eBase )
289 {
290 case JSON_INT32: eType = JSON_INT32_VECTOR; break;
291 case JSON_INT64: eType = JSON_INT64_VECTOR; break;
292 case JSON_DOUBLE: eType = JSON_DOUBLE_VECTOR; break;
293 case JSON_STRING: eType = JSON_STRING_VECTOR; break;
294 default: break; // type matches across all entries, but we do not have a special format for that type
295 }
296 }
297
298 // check for the root (bson v1), note sKey shouldn't be set
299 if ( eType==JSON_OBJECT && m_dBuffer.GetLength()==4 && !sKey )
300 eType = JSON_ROOT;
301
302 // write node type
303 if ( eType!=JSON_ROOT )
304 m_dBuffer.Add ( (BYTE)eType );
305
306 // write key if given
307 if ( sKey )
308 PackStr ( sKey, iKeyLen );
309
310 switch ( eType )
311 {
312 // basic types
313 case JSON_INT32: StoreInt ( (int)tNode.m_iValue ); break;
314 case JSON_INT64: StoreBigint ( tNode.m_iValue ); break;
315 case JSON_DOUBLE: StoreBigint ( sphD2QW ( tNode.m_fValue ) ); break;
316 case JSON_STRING: PackNodeStr ( tNode ); break;
317
318 // literals
319 case JSON_TRUE:
320 case JSON_FALSE:
321 case JSON_NULL:
322 // no content
323 break;
324
325 // associative arrays
326 case JSON_ROOT:
327 case JSON_OBJECT:
328 {
329 DWORD uMask = 0;
330 int iOfs = 0;
331
332 if ( eType==JSON_OBJECT )
333 {
334 iOfs = ReserveSize();
335 StoreInt ( uMask );
336 }
337
338 ARRAY_FOREACH ( i, dNodes )
339 {
340 char * sObjKey = m_pBuf + dNodes[i].m_iKeyStart;
341 int iLen = KeyUnescape ( &sObjKey, dNodes[i].m_iKeyEnd-dNodes[i].m_iKeyStart );
342 WriteNode ( dNodes[i], sObjKey, iLen );
343 uMask |= sphJsonKeyMask ( sObjKey, iLen );
344 }
345 m_dBuffer.Add ( JSON_EOF );
346
347 if ( eType==JSON_OBJECT )
348 {
349 StoreMask ( iOfs+1, uMask );
350 PackSize ( iOfs ); // MUST be in this order, because PackSize() might move the data!
351 } else
352 {
353 assert ( eType==JSON_ROOT );
354 StoreMask ( 0, uMask );
355 }
356 break;
357 }
358
359 // mixed array
360 case JSON_MIXED_VECTOR:
361 {
362 int iOfs = ReserveSize();
363 PackInt ( dNodes.GetLength() );
364 ARRAY_FOREACH ( i, dNodes )
365 WriteNode ( dNodes[i] );
366 PackSize ( iOfs );
367 break;
368 }
369
370 // optimized (generic) arrays
371 case JSON_INT32_VECTOR:
372 PackInt ( dNodes.GetLength() );
373 ARRAY_FOREACH ( i, dNodes )
374 StoreInt ( (int)dNodes[i].m_iValue );
375 break;
376 case JSON_INT64_VECTOR:
377 PackInt ( dNodes.GetLength() );
378 ARRAY_FOREACH ( i, dNodes )
379 StoreBigint ( dNodes[i].m_iValue );
380 break;
381 case JSON_DOUBLE_VECTOR:
382 PackInt ( dNodes.GetLength() );
383 ARRAY_FOREACH ( i, dNodes )
384 StoreBigint ( sphD2QW ( dNodes[i].m_fValue ) );
385 break;
386 case JSON_STRING_VECTOR:
387 {
388 int iOfs = ReserveSize();
389 PackInt ( dNodes.GetLength() );
390 ARRAY_FOREACH ( i, dNodes )
391 PackNodeStr ( dNodes[i] );
392 PackSize ( iOfs );
393 break;
394 }
395 default:
396 assert ( 0 && "internal error: unhandled type" );
397 return false;
398 }
399 return true;
400 }
401
/// prints the indent string to stdout, once per nesting level
void DebugIndent ( int iLevel )
{
	int iLeft = iLevel;
	while ( iLeft-->0 )
		printf ( " " );
}
407
DebugDump(ESphJsonType eType,const BYTE ** ppData,int iLevel)408 void DebugDump ( ESphJsonType eType, const BYTE ** ppData, int iLevel )
409 {
410 DebugIndent ( iLevel );
411
412 const BYTE * p = *ppData;
413
414 switch ( eType )
415 {
416 case JSON_INT32: printf ( "JSON_INT32 %d\n", sphJsonLoadInt ( &p ) ); break;
417 case JSON_INT64: printf ( "JSON_INT64 " INT64_FMT "\n", sphJsonLoadBigint ( &p ) ); break;
418 case JSON_DOUBLE: printf ( "JSON_DOUBLE %lf\n", sphQW2D ( sphJsonLoadBigint ( &p ) ) ); break;
419 case JSON_STRING:
420 {
421 int iLen = sphJsonUnpackInt ( &p );
422 CSphString sVal;
423 sVal.SetBinary ( (const char*)p, iLen );
424 printf ( "JSON_STRING \"%s\"\n", sVal.cstr() );
425 p += iLen;
426 break;
427 }
428
429 case JSON_TRUE: printf ( "JSON_TRUE\n" ); break;
430 case JSON_FALSE: printf ( "JSON_FALSE\n" ); break;
431 case JSON_NULL: printf ( "JSON_NULL\n" ); break;
432 case JSON_EOF: printf ( "JSON_EOF\n" ); break;
433
434 // associative arrays
435 case JSON_ROOT:
436 case JSON_OBJECT:
437 {
438 if ( eType==JSON_OBJECT )
439 sphJsonUnpackInt ( &p );
440
441 DWORD uMask = sphGetDword(p);
442 printf ( "%s (bloom mask: 0x%08x)\n", eType==JSON_OBJECT ? "JSON_OBJECT" : "JSON_ROOT", uMask );
443 p += 4; // skip bloom table
444 for ( ;; )
445 {
446 ESphJsonType eInnerType = (ESphJsonType) *p++;
447 if ( eInnerType==JSON_EOF )
448 break;
449 const int iStrLen = sphJsonUnpackInt ( &p );
450 CSphString sVal;
451 sVal.SetBinary ( (const char*)p, iStrLen );
452 DebugIndent ( iLevel+1 );
453 printf ( "\"%s\"", sVal.cstr() );
454 p += iStrLen;
455 DebugDump ( eInnerType, &p, iLevel+1 );
456 }
457 break;
458 }
459
460 case JSON_MIXED_VECTOR:
461 {
462 int iTotalLen = sphJsonUnpackInt ( &p );
463 int iLen = sphJsonUnpackInt ( &p );
464 printf ( "JSON_MIXED_VECTOR [%d] (%d bytes)\n", iLen, iTotalLen );
465 for ( int i=0; i<iLen; i++ )
466 {
467 ESphJsonType eInnerType = (ESphJsonType)*p++;
468 DebugDump ( eInnerType, &p, iLevel+1 );
469 }
470 break;
471 }
472
473 // optimized arrays ( note they can't be empty )
474 case JSON_STRING_VECTOR:
475 {
476 sphJsonUnpackInt ( &p );
477 int iLen = sphJsonUnpackInt ( &p );
478 printf ( "JSON_STRING_VECTOR (%d) [", iLen );
479 for ( int i=0; i<iLen; i++ )
480 {
481 int iStrLen = sphJsonUnpackInt ( &p );
482 CSphString sVal;
483 sVal.SetBinary ( (const char*)p, iStrLen );
484 printf ( "\"%s\"%s", sVal.cstr(), i<iLen-1 ? "," : "]\n" );
485 p += iStrLen;
486 }
487 break;
488 }
489 case JSON_INT32_VECTOR:
490 {
491 int iLen = sphJsonUnpackInt ( &p );
492 printf ( "JSON_INT32_VECTOR (%d) [", iLen );
493 for ( int i=0; i<iLen; i++ )
494 printf ( "%d%s", sphJsonLoadInt ( &p ), i<iLen-1 ? "," : "]\n" );
495 break;
496 }
497 case JSON_INT64_VECTOR:
498 {
499 int iLen = sphJsonUnpackInt ( &p );
500 printf ( "JSON_INT64_VECTOR (%d) [", iLen );
501 for ( int i=0; i<iLen; i++ )
502 printf ( INT64_FMT"%s", sphJsonLoadBigint ( &p ), i<iLen-1 ? "," : "]\n" );
503 break;
504 }
505 case JSON_DOUBLE_VECTOR:
506 {
507 int iLen = sphJsonUnpackInt ( &p );
508 printf ( "JSON_DOUBLE_VECTOR (%d) [", iLen );
509 for ( int i=0; i<iLen; i++ )
510 printf ( "%lf%s", sphQW2D ( sphJsonLoadBigint ( &p ) ), i<iLen-1 ? "," : "]\n" );
511 break;
512 }
513
514 default:
515 printf ( "UNKNOWN\n" );
516 break;
517 }
518 *ppData = p;
519 }
520
DebugDump(const BYTE * p)521 void DebugDump ( const BYTE * p )
522 {
523 CSphVector<BYTE> dOut;
524 sphJsonFormat ( dOut, m_dBuffer.Begin() );
525 dOut.Add ( '\0' );
526 printf ( "sphJsonFormat: %s\n", (char*)dOut.Begin() );
527
528 printf ( "Blob size: %d bytes\n", m_dBuffer.GetLength() );
529 ESphJsonType eType = sphJsonFindFirst ( &p );
530 DebugDump ( eType, &p, 0 );
531 printf ( "\n" );
532 }
533 };
534
535 // unused parameter, simply to avoid type clash between all my yylex() functions
536 #define YY_NO_UNISTD_H 1
537 #define YY_DECL static int my_lex ( YYSTYPE * lvalp, void * yyscanner, JsonParser_c * pParser )
538
539 #ifdef CMAKE_GENERATED_LEXER
540 #include "flexsphinxjson.c"
541 #else
542 #include "llsphinxjson.c"
543 #endif
544
yyerror(JsonParser_c * pParser,const char * sMessage)545 void yyerror ( JsonParser_c * pParser, const char * sMessage )
546 {
547 yy2lex_unhold ( pParser->m_pScanner );
548 pParser->m_sError.SetSprintf ( "%s near '%s'", sMessage, pParser->m_pLastToken );
549 }
550
yylex(YYSTYPE * lvalp,JsonParser_c * pParser)551 static int yylex ( YYSTYPE * lvalp, JsonParser_c * pParser )
552 {
553 return my_lex ( lvalp, pParser->m_pScanner, pParser );
554 }
555
556 #ifdef CMAKE_GENERATED_GRAMMAR
557 #include "bissphinxjson.c"
558 #else
559 #include "yysphinxjson.c"
560 #endif
561
sphJsonParse(CSphVector<BYTE> & dData,char * sData,bool bAutoconv,bool bToLowercase,CSphString & sError)562 bool sphJsonParse ( CSphVector<BYTE> & dData, char * sData, bool bAutoconv, bool bToLowercase, CSphString & sError )
563 {
564 int iLen = strlen ( sData );
565 if ( sData[iLen+1]!=0 )
566 {
567 sError = "internal error: input data passed to sphJsonParse() must be terminated with a double zero";
568 return false;
569 }
570
571 JsonParser_c tParser ( dData, bAutoconv, bToLowercase, sError );
572 yy2lex_init ( &tParser.m_pScanner );
573
574 tParser.m_pBuf = sData; // sphJsonParse() is intentionally destructive, no need to copy data here
575
576 YY_BUFFER_STATE tLexerBuffer = yy2_scan_buffer ( sData, iLen+2, tParser.m_pScanner );
577 if ( !tLexerBuffer )
578 {
579 sError = "internal error: yy_scan_buffer() failed";
580 return false;
581 }
582
583 int iRes = yyparse ( &tParser );
584 yy2_delete_buffer ( tLexerBuffer, tParser.m_pScanner );
585 yy2lex_destroy ( tParser.m_pScanner );
586
587 tParser.Finalize();
588
589 if ( dData.GetSizeBytes() >= 0x400000 )
590 {
591 sError = "data exceeds 0x400000 bytes";
592 iRes = -1;
593 }
594
595 if ( iRes!=0 )
596 dData.Reset();
597
598 return iRes==0;
599 }
600
601 //////////////////////////////////////////////////////////////////////////
602
sphJsonKeyMask(const char * sKey,int iLen)603 DWORD sphJsonKeyMask ( const char * sKey, int iLen )
604 {
605 DWORD uCrc = sphCRC32 ( sKey, iLen );
606 return
607 ( 1UL<<( uCrc & 31 ) ) +
608 ( 1UL<<( ( uCrc>>8 ) & 31 ) );
609 }
610
611
612 // returns -1 if size is unreachable (for remote agents)
sphJsonNodeSize(ESphJsonType eType,const BYTE * pData)613 int sphJsonNodeSize ( ESphJsonType eType, const BYTE *pData )
614 {
615 int iLen;
616 const BYTE * p = pData;
617 switch ( eType )
618 {
619 case JSON_INT32:
620 return 4;
621 case JSON_INT64:
622 case JSON_DOUBLE:
623 return 8;
624 case JSON_INT32_VECTOR:
625 if ( !p )
626 return -1;
627 iLen = sphJsonUnpackInt ( &p );
628 return p - pData + iLen * 4;
629 case JSON_INT64_VECTOR:
630 case JSON_DOUBLE_VECTOR:
631 if ( !p )
632 return -1;
633 iLen = sphJsonUnpackInt ( &p );
634 return p - pData + iLen * 8;
635 case JSON_STRING:
636 case JSON_STRING_VECTOR:
637 case JSON_MIXED_VECTOR:
638 case JSON_OBJECT:
639 if ( !p )
640 return -1;
641 iLen = sphJsonUnpackInt ( &p );
642 return p - pData + iLen;
643 case JSON_ROOT:
644 if ( !p )
645 return -1;
646 p += 4; // skip filter
647 for ( ;; )
648 {
649 ESphJsonType eNode = (ESphJsonType) *p++;
650 if ( eNode==JSON_EOF )
651 break;
652 // skip key and node
653 iLen = sphJsonUnpackInt ( &p );
654 p += iLen;
655 sphJsonSkipNode ( eNode, &p );
656 }
657 return p - pData;
658 default:
659 return 0;
660 }
661 }
662
663
sphJsonSkipNode(ESphJsonType eType,const BYTE ** ppData)664 void sphJsonSkipNode ( ESphJsonType eType, const BYTE ** ppData )
665 {
666 int iSize = sphJsonNodeSize ( eType, *ppData );
667 *ppData += iSize;
668 }
669
670
sphJsonFieldLength(ESphJsonType eType,const BYTE * pData)671 int sphJsonFieldLength ( ESphJsonType eType, const BYTE * pData )
672 {
673 const BYTE * p = pData;
674 int iCount = 0;
675 switch ( eType )
676 {
677 case JSON_INT32:
678 case JSON_INT64:
679 case JSON_DOUBLE:
680 return 1;
681 case JSON_STRING_VECTOR:
682 case JSON_MIXED_VECTOR:
683 sphJsonUnpackInt ( &p );
684 return sphJsonUnpackInt ( &p );
685 case JSON_INT32_VECTOR:
686 case JSON_INT64_VECTOR:
687 case JSON_DOUBLE_VECTOR:
688 return sphJsonUnpackInt ( &p );
689 case JSON_OBJECT:
690 case JSON_ROOT:
691 if ( eType==JSON_OBJECT )
692 sphJsonUnpackInt ( &p ); // skip size
693 p += 4; // skip filter
694 for ( ;; )
695 {
696 ESphJsonType eNode = (ESphJsonType) *p++;
697 if ( eNode==JSON_EOF )
698 break;
699 int iLen = sphJsonUnpackInt ( &p );
700 p += iLen;
701 sphJsonSkipNode ( eNode, &p );
702 iCount++;
703 }
704 return iCount;
705 default:
706 return 0;
707 }
708 }
709
710
sphJsonFindFirst(const BYTE ** ppData)711 ESphJsonType sphJsonFindFirst ( const BYTE ** ppData )
712 {
713 // non-zero bloom mask? that is JSON_ROOT (basically a JSON_OBJECT without node header)
714 if ( sphGetDword(*ppData) )
715 return JSON_ROOT;
716
717 // zero mask? must be followed by the type byte (typically JSON_EOF)
718 ESphJsonType eType = (ESphJsonType)((*ppData)[4]);
719 *ppData += 5;
720 return eType;
721 }
722
723
sphJsonFindByKey(ESphJsonType eType,const BYTE ** ppValue,const void * pKey,int iLen,DWORD uMask)724 ESphJsonType sphJsonFindByKey ( ESphJsonType eType, const BYTE ** ppValue, const void * pKey, int iLen, DWORD uMask )
725 {
726 if ( eType!=JSON_OBJECT && eType!=JSON_ROOT )
727 return JSON_EOF;
728
729 const BYTE * p = *ppValue;
730 if ( eType==JSON_OBJECT )
731 sphJsonUnpackInt ( &p );
732
733 if ( ( sphGetDword(p) & uMask )!=uMask )
734 return JSON_EOF;
735
736 p += 4;
737 for ( ;; )
738 {
739 eType = (ESphJsonType) *p++;
740 if ( eType==JSON_EOF )
741 break;
742 int iStrLen = sphJsonUnpackInt ( &p );
743 p += iStrLen;
744 if ( iStrLen==iLen && !memcmp ( p-iStrLen, pKey, iStrLen ) )
745 {
746 *ppValue = p;
747 return eType;
748 }
749 sphJsonSkipNode ( eType, &p );
750 }
751
752 return JSON_EOF;
753 }
754
755
sphJsonFindByIndex(ESphJsonType eType,const BYTE ** ppValue,int iIndex)756 ESphJsonType sphJsonFindByIndex ( ESphJsonType eType, const BYTE ** ppValue, int iIndex )
757 {
758 if ( iIndex<0 )
759 return JSON_EOF;
760
761 const BYTE * p = *ppValue;
762 switch ( eType )
763 {
764 case JSON_INT32_VECTOR:
765 case JSON_INT64_VECTOR:
766 case JSON_DOUBLE_VECTOR:
767 {
768 int iLen = sphJsonUnpackInt ( &p );
769 if ( iIndex>=iLen )
770 return JSON_EOF;
771 p += iIndex * ( eType==JSON_INT32_VECTOR ? 4 : 8 );
772 *ppValue = p;
773 return eType==JSON_INT32_VECTOR ? JSON_INT32
774 : eType==JSON_INT64_VECTOR ? JSON_INT64
775 : JSON_DOUBLE;
776 }
777 case JSON_STRING_VECTOR:
778 {
779 sphJsonUnpackInt ( &p );
780 int iLen = sphJsonUnpackInt ( &p );
781 if ( iIndex>=iLen )
782 return JSON_EOF;
783 for ( int i=0; i<iIndex; i++ )
784 {
785 int iStrLen = sphJsonUnpackInt ( &p );
786 p += iStrLen;
787 }
788 *ppValue = p;
789 return JSON_STRING;
790 }
791 case JSON_MIXED_VECTOR:
792 {
793 sphJsonUnpackInt ( &p );
794 int iLen = sphJsonUnpackInt ( &p );
795 if ( iIndex>=iLen )
796 return JSON_EOF;
797 for ( int i=0; i<iIndex; i++ )
798 {
799 eType = (ESphJsonType)*p++;
800 sphJsonSkipNode ( eType, &p );
801 }
802 eType = (ESphJsonType)*p;
803 *ppValue = p+1;
804 return eType;
805 }
806 default:
807 return JSON_EOF;
808 break;
809 }
810 }
811
812 //////////////////////////////////////////////////////////////////////////
813
JsonFormatStr(CSphVector<BYTE> & dOut,const BYTE * p,bool bQuote=true)814 static const BYTE * JsonFormatStr ( CSphVector<BYTE> & dOut, const BYTE * p, bool bQuote=true )
815 {
816 int iLen = sphJsonUnpackInt ( &p );
817 dOut.Reserve ( dOut.GetLength()+iLen );
818 if ( bQuote )
819 dOut.Add ( '"' );
820 while ( iLen-- )
821 {
822 if ( bQuote )
823 {
824 switch ( *p )
825 {
826 case '\b': dOut.Add('\\'); dOut.Add('b'); break;
827 case '\n': dOut.Add('\\'); dOut.Add('n'); break;
828 case '\r': dOut.Add('\\'); dOut.Add('r'); break;
829 case '\t': dOut.Add('\\'); dOut.Add('t'); break;
830 case '\f': dOut.Add('\\'); dOut.Add('f'); break; // formfeed (rfc 4627)
831 default:
832 if ( *p == '"' || *p=='\\' || *p=='/' )
833 dOut.Add ( '\\' );
834 dOut.Add ( *p );
835 }
836 } else
837 dOut.Add ( *p );
838 p++;
839 }
840 if ( bQuote )
841 dOut.Add ( '"' );
842 return p;
843 }
844
845
JsonAddStr(CSphVector<BYTE> & dOut,const char * pStr)846 void JsonAddStr ( CSphVector<BYTE> & dOut, const char * pStr )
847 {
848 while ( *pStr )
849 dOut.Add ( *pStr++ );
850 }
851
852
sphJsonFormat(CSphVector<BYTE> & dOut,const BYTE * pData)853 void sphJsonFormat ( CSphVector<BYTE> & dOut, const BYTE * pData )
854 {
855 if ( !pData )
856 return;
857 ESphJsonType eType = sphJsonFindFirst ( &pData );
858
859 // check for the empty root
860 if ( eType==JSON_EOF )
861 {
862 JsonAddStr ( dOut, "{}" );
863 return;
864 }
865
866 sphJsonFieldFormat ( dOut, pData, eType );
867 }
868
869
sphJsonFieldFormat(CSphVector<BYTE> & dOut,const BYTE * pData,ESphJsonType eType,bool bQuoteString)870 const BYTE * sphJsonFieldFormat ( CSphVector<BYTE> & dOut, const BYTE * pData, ESphJsonType eType, bool bQuoteString )
871 {
872 const BYTE * p = pData;
873
874 // format value
875 switch ( eType )
876 {
877 case JSON_INT32:
878 {
879 int iOff = dOut.GetLength();
880 dOut.Resize ( iOff+32 );
881 int iLen = snprintf ( (char *)dOut.Begin()+iOff, 32, "%d", sphJsonLoadInt ( &p ) ); // NOLINT
882 dOut.Resize ( iOff+iLen );
883 break;
884 }
885 case JSON_INT64:
886 {
887 int iOff = dOut.GetLength();
888 dOut.Resize ( iOff+32 );
889 int iLen = snprintf ( (char *)dOut.Begin()+iOff, 32, INT64_FMT, sphJsonLoadBigint ( &p ) ); // NOLINT
890 dOut.Resize ( iOff+iLen );
891 break;
892 }
893 case JSON_DOUBLE:
894 {
895 int iOff = dOut.GetLength();
896 dOut.Resize ( iOff+32 );
897 int iLen = snprintf ( (char *)dOut.Begin()+iOff, 32, "%lf", sphQW2D ( sphJsonLoadBigint ( &p ) ) ); // NOLINT
898 dOut.Resize ( iOff+iLen );
899 break;
900 }
901 case JSON_STRING:
902 p = JsonFormatStr ( dOut, p, bQuoteString );
903 break;
904 case JSON_STRING_VECTOR:
905 {
906 int iLen = sphJsonUnpackInt ( &p );
907 dOut.Reserve ( dOut.GetLength()+iLen );
908 int iVals = sphJsonUnpackInt ( &p );
909 dOut.Add ( '[' );
910 for ( int i=0; i<iVals; i++ )
911 {
912 if ( i>0 )
913 dOut.Add ( ',' );
914 p = JsonFormatStr ( dOut, p );
915 }
916 dOut.Add ( ']' );
917 break;
918 }
919 case JSON_INT32_VECTOR:
920 case JSON_INT64_VECTOR:
921 case JSON_DOUBLE_VECTOR:
922 {
923 int iVals = sphJsonUnpackInt ( &p );
924 dOut.Add ( '[' );
925 for ( int i=0; i<iVals; i++ )
926 {
927 if ( i>0 )
928 dOut.Add ( ',' );
929 int iOff = dOut.GetLength();
930 dOut.Resize ( iOff+32 );
931 int iLen = 0;
932 char * b = (char *)dOut.Begin()+iOff;
933 switch ( eType )
934 {
935 case JSON_INT32_VECTOR: iLen = snprintf ( b, 32, "%d", sphJsonLoadInt ( &p ) ); break; // NOLINT
936 case JSON_INT64_VECTOR: iLen = snprintf ( b, 32, INT64_FMT, sphJsonLoadBigint ( &p ) ); break; // NOLINT
937 case JSON_DOUBLE_VECTOR: iLen = snprintf ( b, 32, "%lf", sphQW2D ( sphJsonLoadBigint ( &p ) ) ); break; // NOLINT
938 default:
939 break;
940 }
941 dOut.Resize ( iOff+iLen );
942 }
943 dOut.Add ( ']' );
944 break;
945 }
946 case JSON_MIXED_VECTOR:
947 {
948 sphJsonUnpackInt ( &p );
949 int iVals = sphJsonUnpackInt ( &p );
950 dOut.Add ( '[' );
951 for ( int i=0; i<iVals; i++ )
952 {
953 if ( i>0 )
954 dOut.Add ( ',' );
955 ESphJsonType eNode = (ESphJsonType) *p++;
956 p = sphJsonFieldFormat ( dOut, p, eNode, true );
957 }
958 dOut.Add ( ']' );
959 break;
960 }
961 case JSON_ROOT:
962 case JSON_OBJECT:
963 {
964 if ( eType==JSON_OBJECT )
965 sphJsonUnpackInt ( &p );
966 p += 4; // skip bloom table
967 dOut.Add ( '{' );
968 for ( int i=0;;i++ )
969 {
970 ESphJsonType eNode = (ESphJsonType) *p++;
971 if ( eNode==JSON_EOF )
972 break;
973 if ( i>0 )
974 dOut.Add ( ',' );
975 p = JsonFormatStr ( dOut, p );
976 dOut.Add ( ':' );
977 p = sphJsonFieldFormat ( dOut, p, eNode, true );
978 }
979 dOut.Add ( '}' );
980 break;
981 }
982 case JSON_TRUE: JsonAddStr ( dOut, bQuoteString ? "true" : "1" ); break;
983 case JSON_FALSE: JsonAddStr ( dOut, bQuoteString ? "false" : "0" ); break;
984 case JSON_NULL: JsonAddStr ( dOut, bQuoteString ? "null" : "" ); break;
985 case JSON_EOF: break;
986 case JSON_TOTAL: break;
987 }
988
989 return p;
990 }
991
992
sphJsonNameSplit(const char * sName,CSphString * sColumn,CSphString * sKey)993 bool sphJsonNameSplit ( const char * sName, CSphString * sColumn, CSphString * sKey )
994 {
995 if ( !sName )
996 return false;
997
998 // find either '[' or '.', what comes first
999 const char * pSep = sName;
1000 while ( *pSep && *pSep!='.' && *pSep!='[' )
1001 {
1002 // check for invalid characters
1003 if ( !sphIsAttr( *pSep ) && *pSep!=' ' )
1004 return false;
1005 pSep++;
1006 }
1007
1008 if ( !*pSep )
1009 return false;
1010
1011 int iSep = pSep - sName;
1012 if ( sColumn )
1013 {
1014 sColumn->SetBinary ( sName, iSep );
1015 sColumn->Trim();
1016 }
1017
1018 if ( sKey )
1019 *sKey = sName + iSep + ( *pSep=='.' ? 1 : 0 );
1020
1021 return true;
1022 }
1023
1024
JsonKey_t()1025 JsonKey_t::JsonKey_t ()
1026 : m_uMask ( 0 )
1027 , m_iLen ( 0 )
1028 {}
1029
1030
JsonKey_t(const char * sKey,int iLen)1031 JsonKey_t::JsonKey_t ( const char * sKey, int iLen )
1032 {
1033 m_iLen = iLen;
1034 m_uMask = sphJsonKeyMask ( sKey, m_iLen );
1035 m_sKey.SetBinary ( sKey, m_iLen );
1036 }
1037
1038
JsonStoreInt(BYTE * p,int v)1039 void JsonStoreInt ( BYTE * p, int v )
1040 {
1041 *p++ = BYTE(DWORD(v));
1042 *p++ = BYTE(DWORD(v) >> 8);
1043 *p++ = BYTE(DWORD(v) >> 16);
1044 *p++ = BYTE(DWORD(v) >> 24);
1045 }
1046
1047
JsonStoreBigint(BYTE * p,int64_t v)1048 void JsonStoreBigint ( BYTE * p, int64_t v )
1049 {
1050 JsonStoreInt ( p, (DWORD)( v & 0xffffffffUL ) );
1051 JsonStoreInt ( p+4, (int)( v>>32 ) );
1052 }
1053
1054
sphJsonInplaceUpdate(ESphJsonType eValueType,int64_t iValue,ISphExpr * pExpr,BYTE * pStrings,const CSphRowitem * pRow,bool bUpdate)1055 bool sphJsonInplaceUpdate ( ESphJsonType eValueType, int64_t iValue, ISphExpr * pExpr, BYTE * pStrings, const CSphRowitem * pRow, bool bUpdate )
1056 {
1057 if ( !pExpr || !pStrings )
1058 return false;
1059
1060 pExpr->Command ( SPH_EXPR_SET_STRING_POOL, (void*)pStrings );
1061
1062 CSphMatch tMatch;
1063 tMatch.m_pStatic = pRow;
1064
1065 uint64_t uPacked = pExpr->Int64Eval ( tMatch );
1066 BYTE * pData = pStrings + ( uPacked & 0xffffffff );
1067 ESphJsonType eType = (ESphJsonType)( uPacked >> 32 );
1068
1069 switch ( eType )
1070 {
1071 case JSON_INT32:
1072 if ( eValueType==JSON_DOUBLE )
1073 iValue = (int64_t)sphQW2D ( iValue );
1074 if ( int64_t(int(iValue))!=iValue )
1075 return false;
1076 if ( bUpdate )
1077 JsonStoreInt ( pData, (int)iValue );
1078 break;
1079 case JSON_INT64:
1080 if ( bUpdate )
1081 JsonStoreBigint ( pData, eValueType==JSON_DOUBLE ? (int64_t)sphQW2D ( iValue ) : iValue );
1082 break;
1083 case JSON_DOUBLE:
1084 if ( bUpdate )
1085 JsonStoreBigint ( pData, eValueType==JSON_DOUBLE ? iValue : sphD2QW ( (double)iValue ) );
1086 break;
1087 default:
1088 return false;
1089 }
1090 return true;
1091 }
1092
1093
sphJsonStringToNumber(const char * s,int iLen,ESphJsonType & eType,int64_t & iVal,double & fVal)1094 bool sphJsonStringToNumber ( const char * s, int iLen, ESphJsonType & eType, int64_t & iVal, double & fVal )
1095 {
1096 // skip whitespace
1097 while ( iLen>0 && ( *s==' ' || *s=='\n' || *s=='\r' || *s=='\t' || *s=='\f' ) )
1098 s++, iLen--;
1099
1100 if ( iLen<=0 )
1101 return false;
1102
1103 // check whether the string looks like a numeric
1104 const char * p = s;
1105 const char * pEnd = p+iLen-1;
1106 bool bNumeric = ( *p=='-' || *p=='.' || ( *p>='0' && *p<='9' ) );
1107 bool bDot = ( *p=='.' );
1108 bool bExp = false;
1109 bool bExpSign = false;
1110 while ( bNumeric && p<pEnd )
1111 {
1112 p++;
1113 switch ( *p )
1114 {
1115 case '.':
1116 if ( bDot )
1117 bNumeric = false;
1118 bDot = true;
1119 break;
1120 case 'e':
1121 case 'E':
1122 if ( bExp )
1123 bNumeric = false;
1124 bExp = true;
1125 break;
1126 case '-':
1127 case '+':
1128 if ( !bExp || bExpSign )
1129 bNumeric = false;
1130 bExpSign = true;
1131 break;
1132 default:
1133 if ( *p<'0' || *p >'9' )
1134 bNumeric = false;
1135 }
1136 }
1137
1138 // convert string to number
1139 if ( bNumeric && iLen<32 )
1140 {
1141 char sVal[32];
1142 memcpy ( sVal, s, iLen );
1143 sVal[iLen] = '\0';
1144 char * pCur;
1145
1146 // setting errno to zero is necessary because strtod/strtoll do not indicate
1147 // whether it was an overflow or a valid input for borderline values
1148 errno = 0;
1149
1150 if ( bDot || bExp )
1151 {
1152 double fRes = strtod ( sVal, &pCur );
1153 if ( pCur==sVal+iLen && errno!=ERANGE )
1154 {
1155 eType = JSON_DOUBLE;
1156 fVal = fRes;
1157 return true;
1158 }
1159
1160 } else
1161 {
1162 int64_t iRes = strtoll ( sVal, &pCur, 10 );
1163 if ( pCur==sVal+iLen && errno!=ERANGE )
1164 {
1165 eType = JSON_INT64;
1166 iVal = iRes;
1167 return true;
1168 }
1169 }
1170 }
1171
1172 return false;
1173 }
1174
1175 //
1176 // $Id$
1177 //
1178