1 /* Must be defined before including Perl header files or we slow down by 2x! */
2 #define PERL_NO_GET_CONTEXT
3
4 #ifdef __cplusplus
5 extern "C" {
6 #endif
7 #include "EXTERN.h"
8 #include "perl.h"
9 #include "XSUB.h"
10 #include "ppport.h"
11 #ifdef __cplusplus
12 }
13 #endif
14
15 #include <stdlib.h>
16
17 #ifndef PERL_VERSION
18 # include <patchlevel.h>
19 # if !(defined(PERL_VERSION) || (PERL_SUBVERSION > 0 && defined(PATCHLEVEL)))
20 # include <could_not_find_Perl_patchlevel.h>
21 # endif
22 # define PERL_REVISION 5
23 # define PERL_VERSION PATCHLEVEL
24 # define PERL_SUBVERSION PERL_SUBVERSION
25 #endif
26 #if PERL_VERSION < 8
27 # define PERL_MAGIC_qr 'r' /* precompiled qr// regex */
28 # define BFD_Svs_SMG_OR_RMG SVs_RMG
29 #elif ((PERL_VERSION==8) && (PERL_SUBVERSION >= 1) || (PERL_VERSION>8))
30 # define BFD_Svs_SMG_OR_RMG SVs_SMG
31 # define MY_PLACEHOLDER PL_sv_placeholder
32 #else
33 # define BFD_Svs_SMG_OR_RMG SVs_RMG
34 # define MY_PLACEHOLDER PL_sv_undef
35 #endif
36 #if (((PERL_VERSION == 9) && (PERL_SUBVERSION >= 4)) || (PERL_VERSION > 9))
37 # define NEW_REGEX_ENGINE 1
38 #endif
39 #if (((PERL_VERSION == 8) && (PERL_SUBVERSION >= 1)) || (PERL_VERSION > 8))
40 #define MY_CAN_FIND_PLACEHOLDERS
41 #define HAS_SV2OBJ
42 #endif
43
44 /* hv_backreferences_p is not marked as exported in embed.fnc in any perl */
45 #if (PERL_VERSION >= 10)
46 #define HAS_HV_BACKREFS
47 #endif
48
49 #include "srl_protocol.h"
50 #include "srl_encoder.h"
51 #include "srl_common.h"
52 #include "ptable.h"
53 #include "srl_buffer.h"
54 #include "srl_compress.h"
55 #include "qsort.h"
56
57 /* The ENABLE_DANGEROUS_HACKS (passed through from ENV via Makefile.PL) enables
58 * optimizations that may make the code so cozy with a particular version of the
59 * Perl core that the code is no longer portable and/or compatible.
60 * It would be great to determine where these hacks are safe and enable them
61 * where possible. Gut feeling as for portability is that most things will be
62 * ok on Unixes, but fail on the stricter Win32. As for compatibility with old
63 * versions of perl, all bets are off.
64 */
65 #ifdef ENABLE_DANGEROUS_HACKS
66 /* It's unclear why DO_SHARED_HASH_ENTRY_REFCOUNT_CHECK doesn't
67 * help much. It basically means breaking perl's encapsulation to
68 * check whether a HE (hash entry) that is shared has a refcount > 1
69 * and only bothers inserting key into our ptr table if that's the
70 * case. Benchmarks don't show much of a difference and it's a high
71 * price to pay to break encapsulation for something that's not
72 * measureable.
73 */
74 /* DO_SHARED_HASH_ENTRY_REFCOUNT_CHECK only works on 5.10 and better */
75 # define DO_SHARED_HASH_ENTRY_REFCOUNT_CHECK 1
76 #else
77 # define DO_SHARED_HASH_ENTRY_REFCOUNT_CHECK 0
78 #endif
79
80 #define DEFAULT_MAX_RECUR_DEPTH 10000
81
82 #define DEBUGHACK 0
83
84 /* some static function declarations */
85 SRL_STATIC_INLINE void srl_clear_seen_hashes(pTHX_ srl_encoder_t *enc);
86 static void srl_dump_sv(pTHX_ srl_encoder_t *enc, SV *src);
87 SRL_STATIC_INLINE void srl_dump_svpv(pTHX_ srl_encoder_t *enc, SV *src);
88 SRL_STATIC_INLINE void srl_dump_pv(pTHX_ srl_encoder_t *enc, const char* src, STRLEN src_len, int is_utf8);
89 SRL_STATIC_INLINE void srl_fixup_weakrefs(pTHX_ srl_encoder_t *enc);
90 SRL_STATIC_INLINE void srl_dump_av(pTHX_ srl_encoder_t *enc, AV *src, U32 refcnt);
91 SRL_STATIC_INLINE void srl_dump_hv(pTHX_ srl_encoder_t *enc, HV *src, U32 refcnt);
92 SRL_STATIC_INLINE void srl_dump_hk(pTHX_ srl_encoder_t *enc, HE *src, const int share_keys);
93 SRL_STATIC_INLINE void srl_dump_nv(pTHX_ srl_encoder_t *enc, SV *src);
94 SRL_STATIC_INLINE void srl_dump_ivuv(pTHX_ srl_encoder_t *enc, SV *src);
95 SRL_STATIC_INLINE int srl_dump_classname(pTHX_ srl_encoder_t *enc, SV *referent, SV *replacement);
96 SRL_STATIC_INLINE SV *srl_get_frozen_object(pTHX_ srl_encoder_t *enc, SV *src, SV *referent);
97 SRL_STATIC_INLINE PTABLE_t *srl_init_string_hash(srl_encoder_t *enc);
98 SRL_STATIC_INLINE PTABLE_t *srl_init_ref_hash(srl_encoder_t *enc);
99 SRL_STATIC_INLINE PTABLE_t *srl_init_freezeobj_svhash(srl_encoder_t *enc);
100 SRL_STATIC_INLINE PTABLE_t *srl_init_weak_hash(srl_encoder_t *enc);
101 SRL_STATIC_INLINE HV *srl_init_string_deduper_hv(pTHX_ srl_encoder_t *enc);
102
103 /* Note: This returns an encoder struct pointer because it will
104 * clone the current encoder struct if it's dirty. That in
105 * turn means in order to access the output buffer, you need
106 * to inspect the returned encoder struct. If necessary, it
107 * will be cleaned up automatically by Perl, so don't bother
108 * freeing it. */
109 SRL_STATIC_INLINE srl_encoder_t *srl_dump_data_structure(pTHX_ srl_encoder_t *enc, SV *src, SV *user_header_src);
110
111 #define SRL_GET_STR_DEDUPER_HV(enc) ( (enc)->string_deduper_hv == NULL \
112 ? srl_init_string_deduper_hv(aTHX_ enc) \
113 : (enc)->string_deduper_hv )
114
115 #define SRL_GET_STR_PTR_SEENHASH(enc) ( (enc)->str_seenhash == NULL \
116 ? srl_init_string_hash(enc) \
117 : (enc)->str_seenhash )
118
119 #define SRL_GET_REF_SEENHASH(enc) ( (enc)->ref_seenhash == NULL \
120 ? srl_init_ref_hash(enc) \
121 : (enc)->ref_seenhash )
122
123 #define SRL_GET_WEAK_SEENHASH(enc) ( (enc)->weak_seenhash == NULL \
124 ? srl_init_weak_hash(enc) \
125 : (enc)->weak_seenhash )
126
127 #define SRL_GET_WEAK_SEENHASH_OR_NULL(enc) ((enc)->weak_seenhash)
128
129 #define SRL_GET_FREEZEOBJ_SVHASH(enc) ( (enc)->freezeobj_svhash == NULL \
130 ? srl_init_freezeobj_svhash(enc) \
131 : (enc)->freezeobj_svhash )
132
133 #define SRL_ENC_UPDATE_BODY_POS(enc) SRL_UPDATE_BODY_POS(&(enc)->buf, (enc)->protocol_version)
134
135 #ifndef MAX_CHARSET_NAME_LENGTH
136 # define MAX_CHARSET_NAME_LENGTH 2
137 #endif
138
139 #if PERL_VERSION == 10
140 /*
141 Apparently regexes in 5.10 are "modern" but with 5.8 internals
142 */
143 #ifndef RXf_PMf_STD_PMMOD_SHIFT
144 # define RXf_PMf_STD_PMMOD_SHIFT 12
145 #endif
146 #ifndef RE_EXTFLAGS
147 # define RX_EXTFLAGS(re) ((re)->extflags)
148 #endif
149 #ifndef RX_PRECOMP
150 # define RX_PRECOMP(re) ((re)->precomp)
151 #endif
152 #ifndef RX_PRELEN
153 # define RX_PRELEN(re) ((re)->prelen)
154 #endif
155
156 /* Maybe this is only on OS X, where SvUTF8(sv) exists but looks at flags that don't exist */
157 #ifndef RX_UTF8
158 # define RX_UTF8(re) (RX_EXTFLAGS(re) & RXf_UTF8)
159 #endif
160
161 #elif defined(SvRX)
162 # define MODERN_REGEXP
163 # if ( PERL_VERSION > 27 || (PERL_VERSION == 27 && PERL_SUBVERSION >= 3) )
164 /* Commit df6b4bd56551f2d39f7c0019c23f27181d8c39c4
165 * changed the behavior mentioned below, so that the POK flag is on again. Sigh.
166 * So this branch is a deliberate NO-OP, it just makes the conditions easier to read.*/
167 # elif ( PERL_VERSION > 17 || (PERL_VERSION == 17 && PERL_SUBVERSION >= 6) )
168 /* With commit 8d919b0a35f2b57a6bed2f8355b25b19ac5ad0c5 (perl.git) and
169 * release 5.17.6, regular expression are no longer SvPOK (IOW are no longer
170 * considered to be containing a string).
171 * This breaks some of the REGEXP detection logic in srl_dump_sv, so
172 * we need yet another CPP define. */
173 # define REGEXP_NO_LONGER_POK
174 # endif
175 #else
176 # define INT_PAT_MODS "msix"
177 # define RXf_PMf_STD_PMMOD_SHIFT 12
178 # define RX_PRECOMP(re) ((re)->precomp)
179 # define RX_PRELEN(re) ((re)->prelen)
180 # define RX_UTF8(re) ((re)->reganch & ROPT_UTF8)
181 # define RX_EXTFLAGS(re) ((re)->reganch)
182 # define RXf_PMf_COMPILETIME PMf_COMPILETIME
183 #endif
184
185 #if defined(MODERN_REGEXP) && !defined(REGEXP_NO_LONGER_POK)
186 #define DO_POK_REGEXP(enc, src, svt) \
187 /* Only need to enter here if we have rather modern regexps,*/ \
188 /* but they're still POK (pre 5.17.6). */ \
189 if (expect_false( svt == SVt_REGEXP ) ) { \
190 srl_dump_regexp(aTHX_ enc, src); \
191 } \
192 else
193 #else
194 #define DO_POK_REGEXP(enc, src, svt) /*no-op*/
195 #endif
196
197 #define _SRL_IF_SIMPLE_DIRECT_DUMP_SV(enc, src, svt) \
198 if (SvPOK(src)) { \
199 STRLEN L; \
200 char *PV= SvPV(src, L); \
201 if ( SvIOK(src) ) { \
202 if ( SvIV(src) == 0 ) { \
203 if ( L == 1 && PV[0] == '0' ) { \
204 /* its a true 0 */ \
205 srl_buf_cat_char(&enc->buf, SRL_HDR_POS + 0); \
206 } \
207 else { \
208 /* must be a string */ \
209 srl_dump_svpv(aTHX_ enc, src); \
210 } \
211 } \
212 else \
213 if ( \
214 !L || \
215 !isDIGIT(PV[L-1]) || \
216 ( \
217 SvIV(src) > 0 \
218 ? ( PV[0] == '0' || !isDIGIT(PV[0]) ) \
219 : ( L < 2 || PV[0] != '-' || PV[1] == '0' || !isDIGIT(PV[1]) ) \
220 ) \
221 ) { \
222 srl_dump_svpv(aTHX_ enc, src); \
223 } \
224 else { \
225 if ( SvNOK(src) ) { \
226 /* fallback to checking if the canonical stringified*/ \
227 /* int is the same as the buffer */ \
228 sv_setiv(enc->scratch_sv,SvIV(src)); \
229 if ( sv_cmp(enc->scratch_sv,src) ) { \
230 srl_dump_svpv(aTHX_ enc, src); \
231 } else { \
232 srl_dump_ivuv(aTHX_ enc, src); \
233 } \
234 } else { \
235 srl_dump_ivuv(aTHX_ enc, src); \
236 } \
237 } \
238 } \
239 else \
240 if ( SvNOK(src) ) { \
241 if ( L <= 8 || \
242 !isDIGIT(PV[0]) || \
243 !isDIGIT(PV[L-1]) || \
244 PV[L-1] == '0' || \
245 ( \
246 SvNV(src) > 0.0 \
247 ? ( PV[0] == '.' || (PV[0] == '0' && PV[1] != '.') ) \
248 : ( PV[0] != '-' || PV[1] == '.' || (PV[1] == '0' && PV[2] != '.')) \
249 ) \
250 ) { \
251 srl_dump_svpv(aTHX_ enc, src); \
252 } \
253 else { \
254 srl_dump_nv(aTHX_ enc, src); \
255 } \
256 } \
257 else { \
258 DO_POK_REGEXP(enc,src,svt) \
259 srl_dump_svpv(aTHX_ enc, src); \
260 } \
261 } \
262 else \
263 if ( SvIOK(src) ) { \
264 srl_dump_ivuv(aTHX_ enc, src); \
265 } \
266 else \
267 /* if its a float then its a float */ \
268 if (SvNOK(src)) { \
269 srl_dump_nv(aTHX_ enc, src); \
270 } \
271 else \
272 /* The POKp, IOKp, NOKp checks below deal with PVLV */ \
273 /* if its POK or POKp, then we treat it as a string */ \
274 if (SvPOKp(src)) { \
275 DO_POK_REGEXP(enc,src,svt) \
276 srl_dump_svpv(aTHX_ enc, src); \
277 } \
278 else \
279 /* if its IOKp then we treat it as an int */ \
280 if (SvIOKp(src)) { \
281 srl_dump_ivuv(aTHX_ enc, src); \
282 } \
283 else \
284 /* if its NOKp then we treat it as an nv */ \
285 if (SvNOKp(src)) { \
286 srl_dump_nv(aTHX_ enc, src); \
287 } \
288
289 #define CALL_SRL_DUMP_SV(enc, src) STMT_START { \
290 if (!(src)) { \
291 srl_buf_cat_char(&(enc)->buf, SRL_HDR_CANONICAL_UNDEF); /* is this right? */\
292 } \
293 else \
294 { \
295 svtype svt; \
296 SvGETMAGIC(src); \
297 svt= SvTYPE((src)); \
298 if (svt < SVt_PVMG && \
299 SvREFCNT((src)) == 1 && \
300 !SvROK((src)) \
301 ) { \
302 _SRL_IF_SIMPLE_DIRECT_DUMP_SV(enc, src, svt) \
303 else { \
304 srl_dump_sv(aTHX_ (enc), (src)); \
305 } \
306 } else { \
307 srl_dump_sv(aTHX_ (enc), (src)); \
308 } \
309 } \
310 } STMT_END
311
312 #define CALL_SRL_DUMP_SVP(enc, srcp) STMT_START { \
313 if (!(srcp)) { \
314 srl_buf_cat_char(&(enc)->buf, SRL_HDR_CANONICAL_UNDEF); /* is this right? */\
315 } else { \
316 SV *src= *srcp; \
317 CALL_SRL_DUMP_SV(enc,src); \
318 } \
319 } STMT_END
320
321 /* This is fired when we exit the Perl pseudo-block.
322 * It frees our encoder and all. Put encoder-level cleanup
323 * logic here so that we can simply use croak/longjmp for
324 * exception handling. Makes life vastly easier!
325 */
326 void
srl_destructor_hook(pTHX_ void * p)327 srl_destructor_hook(pTHX_ void *p)
328 {
329 srl_encoder_t *enc = (srl_encoder_t *)p;
330 /* Do not auto-destroy encoder if set to be re-used */
331 if (!SRL_ENC_HAVE_OPTION(enc, SRL_F_REUSE_ENCODER)) {
332 /* Exception cleanup. Under normal operation, we should have
333 * assigned NULL to buf_start after we're done. */
334 srl_destroy_encoder(aTHX_ enc);
335 }
336 else {
337 srl_clear_encoder(aTHX_ enc);
338 }
339 }
340
341 SRL_STATIC_INLINE void
srl_clear_seen_hashes(pTHX_ srl_encoder_t * enc)342 srl_clear_seen_hashes(pTHX_ srl_encoder_t *enc)
343 {
344 if (enc->ref_seenhash != NULL)
345 PTABLE_clear(enc->ref_seenhash);
346 if (enc->freezeobj_svhash != NULL)
347 PTABLE_clear_dec(aTHX_ enc->freezeobj_svhash);
348 if (enc->str_seenhash != NULL)
349 PTABLE_clear(enc->str_seenhash);
350 if (enc->weak_seenhash != NULL)
351 PTABLE_clear(enc->weak_seenhash);
352 if (enc->string_deduper_hv != NULL)
353 hv_clear(enc->string_deduper_hv);
354 }
355
356 void
srl_clear_encoder(pTHX_ srl_encoder_t * enc)357 srl_clear_encoder(pTHX_ srl_encoder_t *enc)
358 {
359 /* TODO I think this could just be made an assert. */
360 if (!SRL_ENC_HAVE_OPER_FLAG(enc, SRL_OF_ENCODER_DIRTY)) {
361 warn("Sereal Encoder being cleared but in virgin state. That is unexpected.");
362 }
363
364 enc->recursion_depth = 0;
365 srl_clear_seen_hashes(aTHX_ enc);
366
367 enc->buf.pos = enc->buf.start;
368 /* tmp_buf.start may be NULL for an unused tmp_buf, but so what? */
369 enc->tmp_buf.pos = enc->tmp_buf.start;
370
371 SRL_SET_BODY_POS(&enc->buf, enc->buf.start);
372
373 SRL_ENC_RESET_OPER_FLAG(enc, SRL_OF_ENCODER_DIRTY);
374 }
375
376 void
srl_destroy_encoder(pTHX_ srl_encoder_t * enc)377 srl_destroy_encoder(pTHX_ srl_encoder_t *enc)
378 {
379 srl_buf_free_buffer(aTHX_ &enc->buf);
380
381 /* Free tmp buffer only if it was allocated at all. */
382 if (enc->tmp_buf.start != NULL)
383 srl_buf_free_buffer(aTHX_ &enc->tmp_buf);
384
385 srl_destroy_snappy_workmem(aTHX_ enc->snappy_workmem);
386
387 if (enc->ref_seenhash != NULL)
388 PTABLE_free(enc->ref_seenhash);
389 if (enc->freezeobj_svhash != NULL)
390 PTABLE_free(enc->freezeobj_svhash);
391 if (enc->str_seenhash != NULL)
392 PTABLE_free(enc->str_seenhash);
393 if (enc->weak_seenhash != NULL)
394 PTABLE_free(enc->weak_seenhash);
395 if (enc->string_deduper_hv != NULL)
396 SvREFCNT_dec(enc->string_deduper_hv);
397
398 SvREFCNT_dec(enc->sereal_string_sv);
399 SvREFCNT_dec(enc->scratch_sv);
400
401 Safefree(enc);
402 }
403
404 /* allocate an empty encoder struct - flags still to be set up */
405 SRL_STATIC_INLINE srl_encoder_t *
srl_empty_encoder_struct(pTHX)406 srl_empty_encoder_struct(pTHX)
407 {
408 srl_encoder_t *enc;
409 Newxz(enc, 1, srl_encoder_t);
410 if (enc == NULL)
411 croak("Out of memory");
412
413 /* Init buffer struct */
414 if (expect_false( srl_buf_init_buffer(aTHX_ &(enc->buf), INITIALIZATION_SIZE) != 0 )) {
415 Safefree(enc);
416 croak("Out of memory");
417 }
418
419 enc->protocol_version = SRL_PROTOCOL_VERSION;
420 enc->max_recursion_depth = DEFAULT_MAX_RECUR_DEPTH;
421
422 return enc;
423 }
424
425 #define my_hv_fetchs(he,val,opt,idx) STMT_START { \
426 he = hv_fetch_ent(opt, options[idx].sv, 0, options[idx].hash); \
427 if (he) \
428 val= HeVAL(he); \
429 else \
430 val= NULL; \
431 } STMT_END
432
433 /* Builds the C-level configuration and state struct. */
434 srl_encoder_t *
srl_build_encoder_struct(pTHX_ HV * opt,sv_with_hash * options)435 srl_build_encoder_struct(pTHX_ HV *opt, sv_with_hash *options)
436 {
437 srl_encoder_t *enc;
438 SV *val;
439 HE *he;
440
441 enc = srl_empty_encoder_struct(aTHX);
442 enc->flags = 0;
443 enc->scratch_sv= newSViv(0);
444
445 /* load options */
446 if (opt != NULL) {
447 int undef_unknown = 0;
448 int compression_format = 0;
449 /* SRL_F_SHARED_HASHKEYS on by default */
450 my_hv_fetchs(he, val, opt, SRL_ENC_OPT_IDX_NO_SHARED_HASHKEYS);
451 if ( !val || !SvTRUE(val) )
452 SRL_ENC_SET_OPTION(enc, SRL_F_SHARED_HASHKEYS);
453
454 /* Needs to be before the snappy options */
455 /* enc->protocol_version defaults to SRL_PROTOCOL_VERSION. */
456 my_hv_fetchs(he, val, opt, SRL_ENC_OPT_IDX_PROTOCOL_VERSION);
457 if (val && SvOK(val)) {
458 enc->protocol_version = SvUV(val);
459 if (enc->protocol_version < 1
460 || enc->protocol_version > SRL_PROTOCOL_VERSION)
461 {
462 croak("Specified Sereal protocol version (%"UVuf") is invalid",
463 (UV)enc->protocol_version);
464 }
465 }
466 else {
467 /* Compatibility with the old way to specify older protocol version */
468 my_hv_fetchs(he, val, opt, SRL_ENC_OPT_IDX_USE_PROTOCOL_V1);
469 if ( val && SvTRUE(val) )
470 enc->protocol_version = 1;
471 }
472
473 my_hv_fetchs(he, val, opt, SRL_ENC_OPT_IDX_CROAK_ON_BLESS);
474 if ( val && SvTRUE(val) )
475 SRL_ENC_SET_OPTION(enc, SRL_F_CROAK_ON_BLESS);
476
477 my_hv_fetchs(he, val, opt, SRL_ENC_OPT_IDX_NO_BLESS_OBJECTS);
478 if ( val && SvTRUE(val) )
479 SRL_ENC_SET_OPTION(enc, SRL_F_NO_BLESS_OBJECTS);
480
481 my_hv_fetchs(he, val, opt, SRL_ENC_OPT_IDX_FREEZE_CALLBACKS);
482 if ( val && SvTRUE(val) ) {
483 if (SRL_ENC_HAVE_OPTION(enc, SRL_F_NO_BLESS_OBJECTS))
484 croak("The no_bless_objects and freeze_callback_support "
485 "options are mutually exclusive");
486 SRL_ENC_SET_OPTION(enc, SRL_F_ENABLE_FREEZE_SUPPORT);
487 enc->sereal_string_sv = newSVpvs("Sereal");
488 }
489
490 my_hv_fetchs(he, val, opt, SRL_ENC_OPT_IDX_COMPRESS);
491 if (val) {
492 compression_format = SvIV(val);
493
494 /* See also Encoder.pm's constants */
495 switch (compression_format) {
496 case 0: /* uncompressed */
497 break;
498 case 1:
499 SRL_ENC_SET_OPTION(enc, SRL_F_COMPRESS_SNAPPY_INCREMENTAL);
500 break;
501 case 2:
502 SRL_ENC_SET_OPTION(enc, SRL_F_COMPRESS_ZLIB);
503 if (enc->protocol_version < 3)
504 croak("Zlib compression was introduced in protocol version 3 and you are asking for only version %i", (int)enc->protocol_version);
505
506 enc->compress_level = MZ_DEFAULT_COMPRESSION;
507 my_hv_fetchs(he, val, opt, SRL_ENC_OPT_IDX_COMPRESS_LEVEL);
508 if ( val && SvTRUE(val) ) {
509 IV lvl = SvIV(val);
510 if (expect_false( lvl < 1 || lvl > 10 )) /* Sekrit: compression lvl 10 is a miniz thing that doesn't exist in normal zlib */
511 croak("'compress_level' needs to be between 1 and 9");
512 enc->compress_level = lvl;
513 }
514 break;
515 case 3:
516 SRL_ENC_SET_OPTION(enc, SRL_F_COMPRESS_ZSTD);
517 if (enc->protocol_version < 3)
518 croak("zstd compression was introduced in protocol version 3 and you are asking for only version %i", (int)enc->protocol_version);
519
520 enc->compress_level = 3; /* default compression level */
521 my_hv_fetchs(he, val, opt, SRL_ENC_OPT_IDX_COMPRESS_LEVEL);
522 if ( val && SvTRUE(val) ) {
523 IV lvl = SvIV(val);
524 if (expect_false( lvl < 1 || lvl > 22 )) /* TODO: ZSTD_maxCLevel() */
525 croak("'compress_level' needs to be between 1 and 22");
526 enc->compress_level = lvl;
527 }
528 break;
529 default:
530 croak("Invalid Sereal compression format");
531 }
532 }
533 else {
534 /* Only bother with old compression options if necessary */
535
536 my_hv_fetchs(he, val, opt, SRL_ENC_OPT_IDX_SNAPPY_INCR);
537 if ( val && SvTRUE(val) ) {
538 SRL_ENC_SET_OPTION(enc, SRL_F_COMPRESS_SNAPPY_INCREMENTAL);
539 compression_format = 1;
540 }
541 else {
542 /* snappy_incr >> snappy */
543 my_hv_fetchs(he, val, opt, SRL_ENC_OPT_IDX_SNAPPY);
544 if ( val && SvTRUE(val) ) {
545 /* incremental is the new black in V2 */
546 if (expect_true( enc->protocol_version > 1 ))
547 SRL_ENC_SET_OPTION(enc, SRL_F_COMPRESS_SNAPPY_INCREMENTAL);
548 else
549 SRL_ENC_SET_OPTION(enc, SRL_F_COMPRESS_SNAPPY);
550 compression_format = 1;
551 }
552 }
553 }
554
555 my_hv_fetchs(he, val, opt, SRL_ENC_OPT_IDX_UNDEF_UNKNOWN);
556 if ( val && SvTRUE(val) ) {
557 undef_unknown = 1;
558 SRL_ENC_SET_OPTION(enc, SRL_F_UNDEF_UNKNOWN);
559 }
560
561 my_hv_fetchs(he, val, opt, SRL_ENC_OPT_IDX_SORT_KEYS);
562 if ( !val )
563 my_hv_fetchs(he, val, opt, SRL_ENC_OPT_IDX_CANONICAL);
564 if ( val && SvTRUE(val) ) {
565 SRL_ENC_SET_OPTION(enc, SRL_F_SORT_KEYS);
566 if (SvIV(val) > 1) {
567 SRL_ENC_SET_OPTION(enc, SRL_F_SORT_KEYS_PERL);
568 if (SvIV(val) > 2) {
569 SRL_ENC_SET_OPTION(enc, SRL_F_SORT_KEYS_PERL_REV);
570 }
571 }
572 }
573
574 my_hv_fetchs(he, val, opt, SRL_ENC_OPT_IDX_CANONICAL_REFS);
575 if ( !val )
576 my_hv_fetchs(he, val, opt, SRL_ENC_OPT_IDX_CANONICAL);
577 if ( val && SvTRUE(val) )
578 SRL_ENC_SET_OPTION(enc, SRL_F_CANONICAL_REFS);
579
580 my_hv_fetchs(he, val, opt, SRL_ENC_OPT_IDX_ALIASED_DEDUPE_STRINGS);
581 if ( val && SvTRUE(val) )
582 SRL_ENC_SET_OPTION(enc, SRL_F_ALIASED_DEDUPE_STRINGS | SRL_F_DEDUPE_STRINGS);
583 else {
584 my_hv_fetchs(he, val, opt, SRL_ENC_OPT_IDX_DEDUPE_STRINGS);
585 if ( val && SvTRUE(val) )
586 SRL_ENC_SET_OPTION(enc, SRL_F_DEDUPE_STRINGS);
587 }
588
589 my_hv_fetchs(he, val, opt, SRL_ENC_OPT_IDX_STRINGIFY_UNKNOWN);
590 if ( val && SvTRUE(val) ) {
591 if (expect_false( undef_unknown ))
592 croak("'undef_unknown' and 'stringify_unknown' "
593 "options are mutually exclusive");
594 SRL_ENC_SET_OPTION(enc, SRL_F_STRINGIFY_UNKNOWN);
595 }
596
597 my_hv_fetchs(he, val, opt, SRL_ENC_OPT_IDX_WARN_UNKNOWN);
598 if ( val && SvTRUE(val) ) {
599 SRL_ENC_SET_OPTION(enc, SRL_F_WARN_UNKNOWN);
600 if (SvIV(val) < 0)
601 SRL_ENC_SET_OPTION(enc, SRL_F_NOWARN_UNKNOWN_OVERLOAD);
602 }
603
604 if (compression_format) {
605 enc->compress_threshold = 1024;
606 my_hv_fetchs(he, val, opt, SRL_ENC_OPT_IDX_COMPRESS_THRESHOLD);
607 if ( val && SvOK(val) )
608 enc->compress_threshold = SvIV(val);
609 else if (compression_format == 1) {
610 /* compression_format==1 is some sort of Snappy */
611 my_hv_fetchs(he, val, opt, SRL_ENC_OPT_IDX_SNAPPY_THRESHOLD);
612 if ( val && SvOK(val) )
613 enc->compress_threshold = SvIV(val);
614 }
615 }
616
617 my_hv_fetchs(he, val, opt, SRL_ENC_OPT_IDX_MAX_RECURSION_DEPTH);
618 if ( val && SvTRUE(val) )
619 enc->max_recursion_depth = SvUV(val);
620 }
621 else {
622 /* SRL_F_SHARED_HASHKEYS on by default */
623 SRL_ENC_SET_OPTION(enc, SRL_F_SHARED_HASHKEYS);
624 }
625
626 DEBUG_ASSERT_BUF_SANE(&enc->buf);
627 return enc;
628 }
629
630 /* clone an encoder without current state */
631 srl_encoder_t *
srl_build_encoder_struct_alike(pTHX_ srl_encoder_t * proto)632 srl_build_encoder_struct_alike(pTHX_ srl_encoder_t *proto)
633 {
634 srl_encoder_t *enc;
635 enc = srl_empty_encoder_struct(aTHX);
636
637 /* Copy the configuration-type, non-ephemeral attributes */
638 enc->flags = proto->flags;
639 enc->max_recursion_depth = proto->max_recursion_depth;
640 enc->compress_threshold = proto->compress_threshold;
641 if (expect_false(SRL_ENC_HAVE_OPTION(enc, SRL_F_ENABLE_FREEZE_SUPPORT))) {
642 enc->sereal_string_sv = newSVpvs("Sereal");
643 }
644 enc->protocol_version = proto->protocol_version;
645 enc->scratch_sv= newSViv(0);
646 DEBUG_ASSERT_BUF_SANE(&enc->buf);
647 return enc;
648 }
649
650 SRL_STATIC_INLINE PTABLE_t *
srl_init_string_hash(srl_encoder_t * enc)651 srl_init_string_hash(srl_encoder_t *enc)
652 {
653 enc->str_seenhash = PTABLE_new_size(4);
654 return enc->str_seenhash;
655 }
656
657 SRL_STATIC_INLINE PTABLE_t *
srl_init_ref_hash(srl_encoder_t * enc)658 srl_init_ref_hash(srl_encoder_t *enc)
659 {
660 enc->ref_seenhash = PTABLE_new_size(4);
661 return enc->ref_seenhash;
662 }
663
664 SRL_STATIC_INLINE PTABLE_t *
srl_init_weak_hash(srl_encoder_t * enc)665 srl_init_weak_hash(srl_encoder_t *enc)
666 {
667 enc->weak_seenhash = PTABLE_new_size(3);
668 return enc->weak_seenhash;
669 }
670
671 SRL_STATIC_INLINE PTABLE_t *
srl_init_freezeobj_svhash(srl_encoder_t * enc)672 srl_init_freezeobj_svhash(srl_encoder_t *enc)
673 {
674 enc->freezeobj_svhash = PTABLE_new_size(3);
675 return enc->freezeobj_svhash;
676 }
677
678 SRL_STATIC_INLINE HV *
srl_init_string_deduper_hv(pTHX_ srl_encoder_t * enc)679 srl_init_string_deduper_hv(pTHX_ srl_encoder_t *enc)
680 {
681 enc->string_deduper_hv = newHV();
682 return enc->string_deduper_hv;
683 }
684
685
686 void
srl_write_header(pTHX_ srl_encoder_t * enc,SV * user_header_src,const U32 compress_flags)687 srl_write_header(pTHX_ srl_encoder_t *enc, SV *user_header_src, const U32 compress_flags)
688 {
689 /* 4th to 8th bit are flags. Using 4th for snappy flag. FIXME needs to go in spec. */
690
691 U8 flags= srl_get_compression_header_flag(compress_flags);
692 const U8 version_and_flags = (U8)enc->protocol_version | flags;
693
694 /* 4 byte magic string + proto version
695 * + potentially uncompressed size varint
696 * + 1 byte varint that indicates zero-length header */
697 BUF_SIZE_ASSERT(&enc->buf, sizeof(SRL_MAGIC_STRING) + 1 + 1);
698 if (expect_true( enc->protocol_version > 2 ))
699 srl_buf_cat_str_s_nocheck(&enc->buf, SRL_MAGIC_STRING_HIGHBIT);
700 else
701 srl_buf_cat_str_s_nocheck(&enc->buf, SRL_MAGIC_STRING);
702 srl_buf_cat_char_nocheck(&enc->buf, version_and_flags);
703 if (user_header_src == NULL) {
704 srl_buf_cat_char_nocheck(&enc->buf, '\0'); /* variable header length (0 right now) */
705 }
706 else {
707 STRLEN user_data_len;
708
709 if (expect_false( enc->protocol_version < 2 ))
710 croak("Cannot serialize user header data in Sereal protocol V1 mode!");
711
712 /* Allocate tmp buffer for swapping if necessary,
713 * will be cleaned up automatically */
714 if (enc->tmp_buf.start == NULL)
715 srl_buf_init_buffer(aTHX_ &enc->tmp_buf, INITIALIZATION_SIZE);
716
717 /* Write document body (for header) into separate buffer */
718 srl_buf_swap_buffer(aTHX_ &enc->tmp_buf, &enc->buf);
719 SRL_ENC_UPDATE_BODY_POS(enc);
720 srl_dump_sv(aTHX_ enc, user_header_src);
721 srl_fixup_weakrefs(aTHX_ enc); /* more bodies to follow */
722 srl_clear_seen_hashes(aTHX_ enc); /* more bodies to follow */
723
724 /* Swap main buffer back in, encode header length&bitfield, copy user header data */
725 user_data_len = BUF_POS_OFS(&enc->buf);
726 srl_buf_swap_buffer(aTHX_ &enc->buf, &enc->tmp_buf);
727
728 BUF_SIZE_ASSERT(&enc->buf, user_data_len + 1 + SRL_MAX_VARINT_LENGTH); /* +1 for bit field, +X for header len */
729
730 /* Encode header length */
731 srl_buf_cat_varint_nocheck(aTHX_ &enc->buf, 0, (UV)(user_data_len + 1)); /* +1 for bit field */
732 /* Encode bitfield */
733 srl_buf_cat_char_nocheck(&enc->buf, '\1');
734 /* Copy user header data */
735 Copy(enc->tmp_buf.start, enc->buf.pos, user_data_len, char);
736 enc->buf.pos += user_data_len;
737
738 enc->tmp_buf.pos = enc->tmp_buf.start; /* reset tmp buffer just to be clean */
739 }
740 }
741
742 /* The following is to handle the fact that under normal build options
743 * VC6 will compare all floating point at 80 bits of precision, regardless
744 * regardless of the type.
745 * By setting the vars to "volatile" we avoid this behavior.
746 * Hopefully this fixes various remaining Win32 test failures we see.
747 *
748 * Note this patch could not have been written without Bulk88's help.
749 * Thanks a lot man!
750 *
751 * Comment from Bulk88:
752 * -O1 and -O2 tested and both of those 2 "failed"
753 * -Op - Improve Float Consistency does not have the bug
754 * Problem not seen in VC 2003
755 * I (Bulk88) don't have a VC 2002 to test v13 officially
756 *
757 */
758 #if defined(_MSC_VER)
759 # if _MSC_VER < 1300
760 # define MS_VC6_WORKAROUND_VOLATILE volatile
761 # else
762 # define MS_VC6_WORKAROUND_VOLATILE
763 # endif
764 #else
765 # define MS_VC6_WORKAROUND_VOLATILE
766 #endif
767
768
769 /* Code for serializing floats */
770 SRL_STATIC_INLINE void
srl_dump_nv(pTHX_ srl_encoder_t * enc,SV * src)771 srl_dump_nv(pTHX_ srl_encoder_t *enc, SV *src)
772 {
773 NV nv= SvNV(src);
774 MS_VC6_WORKAROUND_VOLATILE float f= (float)nv;
775 MS_VC6_WORKAROUND_VOLATILE double d= (double)nv;
776 /* TODO: this logic could be reworked to not duplicate so much code, which will help on win32 */
777 if ( f == nv || nv != nv ) {
778 BUF_SIZE_ASSERT(&enc->buf, 1 + sizeof(f)); /* heuristic: header + string + simple value */
779 srl_buf_cat_char_nocheck(&enc->buf, SRL_HDR_FLOAT);
780 Copy((char *)&f, enc->buf.pos, sizeof(f), char);
781 enc->buf.pos += sizeof(f);
782 } else if (d == nv) {
783 BUF_SIZE_ASSERT(&enc->buf, 1 + sizeof(d)); /* heuristic: header + string + simple value */
784 srl_buf_cat_char_nocheck(&enc->buf, SRL_HDR_DOUBLE);
785 Copy((char *)&d, enc->buf.pos, sizeof(d), char);
786 enc->buf.pos += sizeof(d);
787 } else {
788 BUF_SIZE_ASSERT(&enc->buf, 1 + sizeof(nv)); /* heuristic: header + string + simple value */
789 srl_buf_cat_char_nocheck(&enc->buf, SRL_HDR_LONG_DOUBLE);
790 Copy((char *)&nv, enc->buf.pos, sizeof(nv), char);
791 #if SRL_EXTENDED_PRECISION_LONG_DOUBLE
792 /* x86 uses an 80 bit extended precision. on 64 bit machines
793 * this is 16 bytes long, and on 32 bits its is 12 bytes long.
794 * the unused 2/6 bytes are not necessarily zeroed, potentially
795 * allowing internal memory to be exposed. We therefore zero
796 * the unused bytes here. */
797 memset(enc->buf.pos+10, 0, sizeof(nv) - 10);
798 #endif
799 enc->buf.pos += sizeof(nv);
800 }
801 }
802
803
804 /* Code for serializing any SINGLE integer type */
805 SRL_STATIC_INLINE void
srl_dump_ivuv(pTHX_ srl_encoder_t * enc,SV * src)806 srl_dump_ivuv(pTHX_ srl_encoder_t *enc, SV *src)
807 {
808 char hdr;
809 /* TODO for the time being, we just won't ever use NUMLIST types because that's
810 * a fair amount of extra implementation work. The decoders won't care and
811 * we're just wasting some space. */
812 /* TODO optimize! */
813
814 /* FIXME find a way to express the condition without repeated SvIV/SvUV */
815 if (expect_true( SvIOK_UV(src) || SvIV(src) >= 0 )) {
816 const UV num = SvUV(src); /* FIXME is SvUV_nomg good enough because of the GET magic in dump_sv? SvUVX after having checked the flags? */
817 if (num <= 15) {
818 /* encodable as POS */
819 hdr = SRL_HDR_POS_LOW | (unsigned char)num;
820 srl_buf_cat_char(&enc->buf, hdr);
821 }
822 else {
823 srl_buf_cat_varint(aTHX_ &enc->buf, SRL_HDR_VARINT, num);
824 }
825 }
826 else {
827 const IV num = SvIV(src);
828 if (num >= -16) {
829 /* encodable as NEG */
830 hdr = SRL_HDR_NEG_LOW | ((unsigned char)num + 32);
831 srl_buf_cat_char(&enc->buf, hdr);
832 }
833 else {
834 /* Needs ZIGZAG */
835 srl_buf_cat_zigzag(aTHX_ &enc->buf, SRL_HDR_ZIGZAG, num);
836 }
837 }
838 }
839
840 /* Dumps the tag and class name of an object doing all necessary callbacks or
841 * exception-throwing.
842 * The provided SV must already have been identified as a Perl object
843 * using sv_isobject().
844 * If the return value is not NULL, then it's the actual object content that
845 * needs to be serialized by the caller. */
846 SRL_STATIC_INLINE SV *
srl_get_frozen_object(pTHX_ srl_encoder_t * enc,SV * src,SV * referent)847 srl_get_frozen_object(pTHX_ srl_encoder_t *enc, SV *src, SV *referent)
848 {
849 assert(sv_isobject(src)); /* duplicate asserts are "free" */
850
851 /* Check for FREEZE support */
852 if (expect_false( SRL_ENC_HAVE_OPTION(enc, SRL_F_ENABLE_FREEZE_SUPPORT) )) {
853 HV *stash = SvSTASH(referent);
854 GV *method = NULL;
855 assert(stash != NULL);
856 method = gv_fetchmethod_autoload(stash, "FREEZE", 0);
857
858 if (expect_false( method != NULL )) {
859 SV *replacement= NULL;
860 PTABLE_t *freezeobj_svhash = SRL_GET_FREEZEOBJ_SVHASH(enc);
861 if (SvREFCNT(referent)>1) {
862 replacement= (SV *) PTABLE_fetch(freezeobj_svhash, referent);
863 }
864 if (!replacement) {
865 int count;
866 dSP;
867 ENTER;
868 SAVETMPS;
869 PUSHMARK(SP);
870
871 EXTEND(SP, 2);
872 PUSHs(src);
873 PUSHs(enc->sereal_string_sv); /* not NULL if SRL_F_ENABLE_FREEZE_SUPPORT is set */
874 replacement= (SV*)newAV();
875 PTABLE_store(freezeobj_svhash, referent, replacement);
876
877 PUTBACK;
878 count = call_sv((SV *)GvCV(method), G_ARRAY);
879 /* TODO explore method lookup caching */
880 SPAGAIN;
881
882 while ( count-- > 0) {
883 SV *tmp = POPs;
884 SvREFCNT_inc(tmp);
885 if (!av_store((AV*)replacement,count,tmp))
886 croak("Failed to push value into array");
887 }
888
889 PUTBACK;
890 FREETMPS;
891 LEAVE;
892 }
893 return replacement;
894 }
895 }
896 return NULL;
897
898 }
899
900 /* Outputs a bless header and the class name (as some form of string or COPY).
901 * Caller then has to output the actual reference payload.
902 * If it returns 1 it means the classname was written out and should NOT
903 * be overwritten by the ref rewrite logic (which handles REFP).
904 * If it returns 0 it means no classname was output. */
905 SRL_STATIC_INLINE int
srl_dump_classname(pTHX_ srl_encoder_t * enc,SV * referent,SV * replacement)906 srl_dump_classname(pTHX_ srl_encoder_t *enc, SV *referent, SV *replacement)
907 {
908 /* Check that we actually want to support objects */
909 if (expect_false( SRL_ENC_HAVE_OPTION(enc, SRL_F_CROAK_ON_BLESS)) ) {
910 croak("Attempted to serialize blessed reference. Serializing objects "
911 "using Sereal::Encoder was explicitly disabled using the "
912 "'croak_on_bless' option.");
913 } else if (expect_false( SRL_ENC_HAVE_OPTION(enc, SRL_F_NO_BLESS_OBJECTS) )) {
914 return 0;
915 } else {
916 const HV *stash = SvSTASH(referent);
917 PTABLE_t *string_seenhash = SRL_GET_STR_PTR_SEENHASH(enc);
918 svtype svt= SvTYPE(referent);
919 int is_av_or_hv= (svt == SVt_PVAV || svt== SVt_PVHV);
920 ptrdiff_t oldoffset= is_av_or_hv
921 ? 0
922 : (ptrdiff_t)PTABLE_fetch(string_seenhash, referent);
923
924 if (oldoffset) {
925 return 0;
926 } else {
927 svt= replacement ? SvTYPE(replacement) : SvTYPE(referent);
928 if (SRL_UNSUPPORTED_SvTYPE(svt)) {
929 return 0;
930 }
931 oldoffset= (ptrdiff_t)PTABLE_fetch(string_seenhash, (SV *)stash);
932 }
933
934 if (oldoffset != 0) {
935 /* Issue COPY instead of literal class name string */
936 srl_buf_cat_varint(aTHX_ &enc->buf,
937 expect_false(replacement) ? SRL_HDR_OBJECTV_FREEZE : SRL_HDR_OBJECTV,
938 (UV)oldoffset);
939 }
940 else {
941 const char *class_name = HvNAME_get(stash);
942 const size_t len = HvNAMELEN_get(stash);
943
944 /* First save this new string (well, the HV * that it is represented by) into the string
945 * dedupe table.
946 * By saving the ptr to the HV, we only dedupe class names with class names, though
947 * this seems a small price to pay for not having to keep a full string table.
948 * At least, we can safely use the same PTABLE to store the ptrs to hashkeys since
949 * the set of pointers will never collide.
950 * /me bows to Yves for the delightfully evil hack. */
951 srl_buf_cat_char(&enc->buf, expect_false(replacement) ? SRL_HDR_OBJECT_FREEZE : SRL_HDR_OBJECT);
952
953 /* remember current offset before advancing it */
954 PTABLE_store(string_seenhash, (void *)stash, INT2PTR(void *, BODY_POS_OFS(&enc->buf)));
955
956 /* HvNAMEUTF8 not in older perls and it would be 0 for those anyway */
957 #if PERL_VERSION >= 16
958 srl_dump_pv(aTHX_ enc, class_name, len, HvNAMEUTF8(stash));
959 #else
960 srl_dump_pv(aTHX_ enc, class_name, len, 0);
961 #endif
962 }
963 if (is_av_or_hv) {
964 return 0;
965 } else {
966 /* use the string_seenhash to track which items we have seen before */
967 PTABLE_store(string_seenhash, (void *)referent, INT2PTR(void *, BODY_POS_OFS(&enc->buf)));
968 return 1;
969 }
970 }
971 return 0;
972 }
973
974
975 /* Prepare encoder for encoding: Clone if already in use since
976 * encoders aren't "reentrant". Set as in use and register cleanup
977 * routine with Perl. */
978 SRL_STATIC_INLINE srl_encoder_t *
srl_prepare_encoder(pTHX_ srl_encoder_t * enc)979 srl_prepare_encoder(pTHX_ srl_encoder_t *enc)
980 {
981 /* Check whether encoder is in use and create a new one on the
982 * fly if necessary. Should only happen in edge cases such as
983 * FREEZE hooks that serialize things using the same encoder
984 * object. */
985 if (SRL_ENC_HAVE_OPER_FLAG(enc, SRL_OF_ENCODER_DIRTY)) {
986 srl_encoder_t * const proto = enc;
987 enc = srl_build_encoder_struct_alike(aTHX_ proto);
988 SRL_ENC_RESET_OPTION(enc, SRL_F_REUSE_ENCODER);
989 }
990 /* Set to being in use */;
991 SRL_ENC_SET_OPER_FLAG(enc, SRL_OF_ENCODER_DIRTY);
992
993 /* Register our structure for destruction on scope exit */
994 SAVEDESTRUCTOR_X(&srl_destructor_hook, (void *)enc);
995
996 return enc;
997 }
998
999 SRL_STATIC_INLINE srl_encoder_t *
srl_dump_data_structure(pTHX_ srl_encoder_t * enc,SV * src,SV * user_header_src)1000 srl_dump_data_structure(pTHX_ srl_encoder_t *enc, SV *src, SV *user_header_src)
1001 {
1002 U32 compress_flags;
1003
1004 enc = srl_prepare_encoder(aTHX_ enc);
1005 compress_flags= SRL_ENC_HAVE_OPTION(enc, SRL_F_COMPRESS_FLAGS_MASK);
1006
1007 if (expect_false(compress_flags))
1008 { /* Have some sort of compression */
1009 ptrdiff_t sereal_header_len;
1010 STRLEN uncompressed_body_length;
1011 const STRLEN max_len = (1L << 32) - 1;
1012
1013 /* Alas, have to write entire packet first since the header length
1014 * will determine offsets. */
1015 srl_write_header(aTHX_ enc, user_header_src, compress_flags);
1016 sereal_header_len = BUF_POS_OFS(&enc->buf);
1017 SRL_ENC_UPDATE_BODY_POS(enc);
1018 srl_dump_sv(aTHX_ enc, src);
1019 srl_fixup_weakrefs(aTHX_ enc);
1020 assert(BUF_POS_OFS(&enc->buf) > sereal_header_len);
1021 uncompressed_body_length = BUF_POS_OFS(&enc->buf) - sereal_header_len;
1022
1023 if ((uncompressed_body_length < (STRLEN)enc->compress_threshold) || uncompressed_body_length > max_len) {
1024 if (uncompressed_body_length > max_len) {
1025 /* we dont support SNAPPY on super long buffers, it has a 2**32 limit
1026 * and we currently don't support splitting things up. See Issue #88 */
1027 warn("disabling SNAPPY compression as buffer is too large!");
1028 }
1029 /* Don't bother with compression at all if we have less than $threshold bytes of payload */
1030 srl_reset_compression_header_flag(&enc->buf);
1031 }
1032 else { /* Do Snappy or zlib compression of body */
1033 srl_compress_body(aTHX_ &enc->buf, sereal_header_len,
1034 compress_flags, enc->compress_level,
1035 &enc->snappy_workmem);
1036
1037 SRL_ENC_UPDATE_BODY_POS(enc);
1038 DEBUG_ASSERT_BUF_SANE(&enc->buf);
1039 }
1040 } /* End of "want compression?" */
1041 else
1042 {
1043 srl_write_header(aTHX_ enc, user_header_src, compress_flags);
1044 SRL_ENC_UPDATE_BODY_POS(enc);
1045 srl_dump_sv(aTHX_ enc, src);
1046 srl_fixup_weakrefs(aTHX_ enc);
1047 }
1048
1049 /* NOT doing a
1050 * SRL_ENC_RESET_OPER_FLAG(enc, SRL_OF_ENCODER_DIRTY);
1051 * here because we're relying on the SAVEDESTRUCTOR_X call. */
1052 return enc;
1053 }
1054
1055 SV *
srl_dump_data_structure_mortal_sv(pTHX_ srl_encoder_t * enc,SV * src,SV * user_header_src,const U32 flags)1056 srl_dump_data_structure_mortal_sv(pTHX_ srl_encoder_t *enc, SV *src, SV *user_header_src, const U32 flags)
1057 {
1058 assert(enc);
1059 enc = srl_dump_data_structure(aTHX_ enc, src, user_header_src);
1060 assert(enc->buf.start && enc->buf.pos && enc->buf.pos > enc->buf.start);
1061
1062 if ( flags && /* for now simpler and equivalent to: flags == SRL_ENC_SV_REUSE_MAYBE */
1063 (BUF_POS_OFS(&enc->buf) > 20 && BUF_SPACE(&enc->buf) < BUF_POS_OFS(&enc->buf) )
1064 ){
1065 /* If not wasting more than 2x memory - FIXME fungible */
1066 SV *sv = sv_2mortal(newSV_type(SVt_PV));
1067 SvPV_set(sv, (char *) enc->buf.start);
1068 SvLEN_set(sv, BUF_SIZE(&enc->buf));
1069 SvCUR_set(sv, BUF_POS_OFS(&enc->buf));
1070 SvPOK_on(sv);
1071 enc->buf.start = enc->buf.pos = NULL; /* no need to free these guys now */
1072 return sv;
1073 }
1074
1075 return sv_2mortal(newSVpvn((char *)enc->buf.start, (STRLEN)BUF_POS_OFS(&enc->buf)));
1076 }
1077
1078 SRL_STATIC_INLINE void
srl_fixup_weakrefs(pTHX_ srl_encoder_t * enc)1079 srl_fixup_weakrefs(pTHX_ srl_encoder_t *enc)
1080 {
1081 PTABLE_t *weak_seenhash = SRL_GET_WEAK_SEENHASH_OR_NULL(enc);
1082 if (!weak_seenhash)
1083 return;
1084
1085 {
1086 PTABLE_ITER_t *it = PTABLE_iter_new(weak_seenhash);
1087 PTABLE_ENTRY_t *ent;
1088
1089 /* we now walk the weak_seenhash and set any tags it points
1090 * at to the PAD opcode, this basically turns the first weakref
1091 * we encountered into a normal ref when there is only a weakref
1092 * pointing at the structure. */
1093 while ( NULL != (ent = PTABLE_iter_next(it)) ) {
1094 const ptrdiff_t offset = (ptrdiff_t)ent->value;
1095 if ( offset ) {
1096 srl_buffer_char *pos = enc->buf.body_pos + offset;
1097 assert(*pos == SRL_HDR_WEAKEN);
1098 if (DEBUGHACK) warn("setting byte at offset %"UVuf" to PAD", (UV)offset);
1099 *pos = SRL_HDR_PAD;
1100 }
1101 }
1102
1103 PTABLE_iter_free(it);
1104 }
1105 }
1106
1107
1108
1109 static inline void
srl_dump_regexp(pTHX_ srl_encoder_t * enc,SV * sv)1110 srl_dump_regexp(pTHX_ srl_encoder_t *enc, SV *sv)
1111 {
1112 STRLEN left = 0;
1113 const char *fptr;
1114 char ch;
1115 U16 match_flags;
1116 #ifdef MODERN_REGEXP
1117 REGEXP *re= SvRX(sv);
1118 #else
1119 regexp *re = (regexp *)(((MAGIC*)sv)->mg_obj);
1120 #endif
1121
1122 char reflags[sizeof(INT_PAT_MODS) + MAX_CHARSET_NAME_LENGTH];
1123
1124 /*
1125 we are in list context so stringify
1126 the modifiers that apply. We ignore "negative
1127 modifiers" in this scenario, and the default character set
1128 */
1129
1130 #ifdef REGEXP_DEPENDS_CHARSET
1131 if (get_regex_charset(RX_EXTFLAGS(re)) != REGEX_DEPENDS_CHARSET) {
1132 STRLEN len;
1133 const char* const name = get_regex_charset_name(RX_EXTFLAGS(re),
1134 &len);
1135 Copy(name, reflags + left, len, char);
1136 left += len;
1137 }
1138 #endif
1139 fptr = INT_PAT_MODS;
1140 match_flags = (U16)((RX_EXTFLAGS(re) & RXf_PMf_COMPILETIME)
1141 >> RXf_PMf_STD_PMMOD_SHIFT);
1142
1143 while((ch = *fptr++)) {
1144 if(match_flags & 1) {
1145 reflags[left++] = ch;
1146 }
1147 match_flags >>= 1;
1148 }
1149
1150 srl_buf_cat_char(&enc->buf, SRL_HDR_REGEXP);
1151 srl_dump_pv(aTHX_ enc, RX_PRECOMP(re),RX_PRELEN(re), (RX_UTF8(re) ? SVf_UTF8 : 0));
1152 srl_dump_pv(aTHX_ enc, reflags, left, 0);
1153 return;
1154 }
1155
1156 #define ASSUME_BYTES_PER_TAG 4
1157 #define BUF_SIZE_ASSERT_AV(b,n) \
1158 BUF_SIZE_ASSERT((b), 2 + SRL_MAX_VARINT_LENGTH + (1 * ASSUME_BYTES_PER_TAG * (n) ) )
1159 /* heuristic: 6 * n = liberal estimate of min size of n hashkeys */
1160 #define BUF_SIZE_ASSERT_HV(b, n) \
1161 BUF_SIZE_ASSERT((b), 2 + SRL_MAX_VARINT_LENGTH + (2 * ASSUME_BYTES_PER_TAG * (n) ) )
1162
1163 SRL_STATIC_INLINE void
srl_dump_av(pTHX_ srl_encoder_t * enc,AV * src,U32 refcount)1164 srl_dump_av(pTHX_ srl_encoder_t *enc, AV *src, U32 refcount)
1165 {
1166 UV n;
1167 SV **svp;
1168
1169 n = av_len(src)+1;
1170
1171 /* heuristic: n is virtually the min. size of any element */
1172 BUF_SIZE_ASSERT_AV(&enc->buf, n);
1173
1174 if (n < 16 && refcount == 1 && !SRL_ENC_HAVE_OPTION(enc,SRL_F_CANONICAL_REFS)) {
1175 enc->buf.pos--; /* backup over previous REFN */
1176 srl_buf_cat_char_nocheck(&enc->buf, SRL_HDR_ARRAYREF + n);
1177 } else {
1178 /* header and num. elements */
1179 srl_buf_cat_varint_nocheck(aTHX_ &enc->buf, SRL_HDR_ARRAY, n);
1180 }
1181 if (!n)
1182 return;
1183 /* I can't decide if this should make me feel dirty */
1184 if (SvMAGICAL(src)) {
1185 UV i;
1186 for (i = 0; i < n; ++i) {
1187 svp = av_fetch(src, i, 0);
1188 CALL_SRL_DUMP_SVP(enc, svp);
1189 }
1190 } else {
1191 SV **end;
1192 svp= AvARRAY(src);
1193 end= svp + n;
1194 for ( ; svp < end ; svp++) {
1195 /* we cannot have a null *svp so we do not use CALL_SRL_DUMP_SVP() here */
1196 CALL_SRL_DUMP_SV(enc, *svp);
1197 }
1198 }
1199 }
1200
1201 SRL_STATIC_INLINE void
srl_dump_hv_unsorted_nomg(pTHX_ srl_encoder_t * enc,HV * src,UV n)1202 srl_dump_hv_unsorted_nomg(pTHX_ srl_encoder_t *enc, HV *src, UV n)
1203 {
1204 HE *he;
1205 const int do_share_keys = HvSHAREKEYS((SV *)src);
1206 HE **he_ptr= HvARRAY(src);
1207 HE **he_end= he_ptr + HvMAX(src) + 1;
1208
1209 do {
1210 for (he= *he_ptr++; he; he= HeNEXT(he) ) {
1211 SV *v= HeVAL(he);
1212 if (v != &PL_sv_placeholder) {
1213 srl_dump_hk(aTHX_ enc, he, do_share_keys);
1214 CALL_SRL_DUMP_SV(enc, v);
1215 if (--n == 0) {
1216 he_ptr= he_end;
1217 break;
1218 }
1219 }
1220 }
1221 } while ( he_ptr < he_end );
1222 }
1223
1224 SRL_STATIC_INLINE void
srl_dump_hv_unsorted_mg(pTHX_ srl_encoder_t * enc,HV * src,const UV n)1225 srl_dump_hv_unsorted_mg(pTHX_ srl_encoder_t *enc, HV *src, const UV n)
1226 {
1227 HE *he;
1228 UV i= 0;
1229 const int do_share_keys = HvSHAREKEYS((SV *)src);
1230
1231 (void)hv_iterinit(src); /* return value not reliable according to API docs */
1232 while ((he = hv_iternext(src))) {
1233 SV *v;
1234 if (expect_false( i == n ))
1235 croak("Panic: cannot serialize a tied hash which changes its size!");
1236 v= hv_iterval(src, he);
1237 srl_dump_hk(aTHX_ enc, he, do_share_keys);
1238 CALL_SRL_DUMP_SV(enc, v);
1239 ++i;
1240 }
1241 if (expect_false( i != n ))
1242 croak("Panic: cannot serialize a tied hash which changes its size!");
1243 }
1244
1245 /* sorting hashes - nothing in perl is easy. ever.
1246 *
1247 * Some things to keep in mind about perl hashes as you read this code:
1248 *
1249 * Hashes may be shared or not. Usually shared. This means they share their
1250 * key data via PL_strtab.
1251 *
1252 * Hashes may be tied or not. Usually not. When tied the keys from the hash
1253 * are available only as SV *'s, and when untied, the keys from the hash are
1254 * accessed via HE *'s.
1255 *
1256 * Some HE's actually contains SV's but most contain a ptr/len combo with
1257 * an utf8 flag. To make things even more interesting utf8 keys are
1258 * normalized to latin1 by perl where possible before being stored in the HE,
1259 * with the utf8 flag indicating "was utf8" instead of "is utf8" or "not utf8".
1260 *
1261 * The complexity about accessing the key for a hash can be managed away by
1262 * perl via API's like hv_iterkeysv(), but using that means constructing mortal
1263 * SV's for each key as we go.
1264 *
1265 * We could in theory use the HePV() interface, but one annoying result of the
1266 * "was utf8" logic is that we can't use a sort comparator which looks
1267 * at the raw binary of the keys when the keys might contain utf8. A utf8 key
1268 * like "\xDF" will be downgraded to ascii in the HE form, but will be upgraded
1269 * to the utf8 representation in the SV form. So if we want to do "fast" sorting
1270 * we have to restrict it to non-utf8/non-sv keys, and force the use of the SV
1271 * based API (which we have to use for tie's anyway) when we see a UTF8 key.
1272 *
1273 * Which is what we do below. In order to sort a hash we need to construct an
1274 * array of its contents, in srl_dump_sorted_nomg() we walk the hash, checking
1275 * each key, and copying each HE over into a scratch buffer which it then sorts.
1276 * If during the transcription process it sees any utf8 or SV keys it exits
1277 * immediately, and falls through to srl_dump_sort_mg(), which uses hv_iterkeysv()
1278 * to construct an array of HE_SV instead, which we then sort.
1279 */
1280
1281
1282
1283 SRL_STATIC_INLINE int
he_islt(const HE * a,const HE * b)1284 he_islt(const HE *a, const HE *b)
1285 {
1286 /* no need for a dTHX here, we don't use anything that needs it */
1287 const STRLEN la = HeKLEN(a);
1288 const STRLEN lb = HeKLEN(b);
1289 const int cmp = memcmp(HeKEY(a), HeKEY(b), la < lb ? la : lb);
1290 if (cmp) {
1291 return cmp < 0;
1292 } else {
1293 return la < lb;
1294 }
1295 }
1296
1297 SRL_STATIC_INLINE int
he_sv_islt_fast(const HE_SV * a,const HE_SV * b)1298 he_sv_islt_fast(const HE_SV *a, const HE_SV *b)
1299 {
1300 /* no need for a dTHX here, we don't use anything that needs it */
1301 char *a_ptr;
1302 char *b_ptr;
1303 int a_isutf8;
1304 int b_isutf8;
1305 const STRLEN a_len= a->key.sv ? SvCUR(a->key.sv) : HeKLEN(a->val.he);
1306 const STRLEN b_len= b->key.sv ? SvCUR(b->key.sv) : HeKLEN(b->val.he);
1307 if (a_len != b_len) {
1308 return a_len < b_len;
1309 }
1310 a_isutf8= (a->key.sv ? SvUTF8(a->key.sv) : HeKUTF8(a->val.he)) ? 0 : 1;
1311 b_isutf8= (b->key.sv ? SvUTF8(b->key.sv) : HeKUTF8(b->val.he)) ? 0 : 1;
1312 if (a_isutf8 != b_isutf8) {
1313 return a_isutf8 < b_isutf8;
1314 }
1315 a_ptr= a->key.sv ? SvPVX(a->key.sv) : HeKEY(a->val.he);
1316 b_ptr= b->key.sv ? SvPVX(b->key.sv) : HeKEY(b->val.he);
1317 return memcmp(a_ptr, b_ptr, a_len < b_len ? a_len : b_len ) < 0;
1318 }
1319
1320 #define ISLT_HE_SV(a,b) he_sv_islt_fast( a, b )
1321 #define ISLT_SV_CMP(a,b) sv_cmp(a->key.sv, b->key.sv) == sort_dir
1322
1323
1324 SRL_STATIC_INLINE void
srl_qsort(pTHX_ srl_encoder_t * enc,const UV n,HE_SV * array)1325 srl_qsort(pTHX_ srl_encoder_t *enc, const UV n, HE_SV *array)
1326 {
1327 if ( SRL_ENC_HAVE_OPTION(enc, SRL_F_SORT_KEYS_PERL) ) {
1328 int sort_dir= SRL_ENC_HAVE_OPTION(enc, SRL_F_SORT_KEYS_PERL_REV) ? 1 : -1;
1329 /* hack to forcefully disable "use bytes" */
1330 COP cop= *PL_curcop;
1331 cop.op_private= 0;
1332
1333 ENTER;
1334 SAVETMPS;
1335
1336 SAVEVPTR (PL_curcop);
1337 PL_curcop= &cop;
1338
1339 /* now sort */
1340 QSORT(HE_SV, array, n, ISLT_SV_CMP);
1341
1342 FREETMPS;
1343 LEAVE;
1344 } else {
1345 /* now sort */
1346 QSORT(HE_SV, array, n, ISLT_HE_SV);
1347 }
1348 }
1349
1350
1351 SRL_STATIC_INLINE void
srl_dump_hv_sorted_sv_slow(pTHX_ srl_encoder_t * enc,HV * src,const UV n,HE_SV * array)1352 srl_dump_hv_sorted_sv_slow(pTHX_ srl_encoder_t *enc, HV *src, const UV n, HE_SV *array)
1353 {
1354 HE *he;
1355 UV i= 0;
1356 const int do_share_keys = HvSHAREKEYS((SV *)src);
1357 const int is_tie= !array;
1358
1359 /* This sub is used for ties, and for hashes with SV keys in them,
1360 * and when the user requests SORT_KEYS_PERL, it is the slowest way
1361 * and most memory hungry way to serialize a hash. We will use the
1362 * full perl api for extracting the contents of the hash, which fortifies
1363 * us against ties, and we will convert all keys into mortal
1364 * sv's where necessary. This means we can use sv_cmp on the keys
1365 * if we wish.
1366 */
1367
1368 (void)hv_iterinit(src); /* return value not reliable according to API docs */
1369 {
1370 HE_SV *array_end;
1371 if (!array) {
1372 Newx(array, n, HE_SV);
1373 SAVEFREEPV(array);
1374 }
1375 array_end= array + n;
1376 while ((he = hv_iternext(src))) {
1377 if (expect_false( i == n ))
1378 croak("Panic: cannot serialize a %s hash which changes its size!",is_tie ? "tied" : "untied");
1379 array[i].key.sv= hv_iterkeysv(he);
1380 array[i].val.sv= hv_iterval(src,he);
1381 i++;
1382 }
1383 if (expect_false( i != n ))
1384 croak("Panic: can not serialize a %s hash which changes it size!", is_tie ? "tied" : "untied");
1385
1386 srl_qsort(aTHX_ enc, n, array);
1387
1388 while ( array < array_end ) {
1389 CALL_SRL_DUMP_SV(enc, array->key.sv);
1390 CALL_SRL_DUMP_SV(enc, array->val.sv);
1391 array++;
1392 }
1393 }
1394 }
1395
1396
1397 SRL_STATIC_INLINE void
srl_dump_hv_sorted_nomg(pTHX_ srl_encoder_t * enc,HV * src,const UV n)1398 srl_dump_hv_sorted_nomg(pTHX_ srl_encoder_t *enc, HV *src, const UV n)
1399 {
1400 HE *he;
1401 const int do_share_keys = HvSHAREKEYS((SV *)src);
1402
1403 /* This sub is used only for untied hashes and when the user wants
1404 * sorted keys, but not necessarily the order that perl would use.
1405 */
1406
1407 (void)hv_iterinit(src); /* return value not reliable according to API docs */
1408 {
1409 HE_SV *array;
1410 HE_SV *array_ptr;
1411 HE_SV *array_end;
1412 Newx(array, n, HE_SV);
1413 SAVEFREEPV(array);
1414 array_ptr = array;
1415 while ((he = hv_iternext(src))) {
1416 if ( HeKWASUTF8(he) ) {
1417 array_ptr->key.sv= hv_iterkeysv(he);
1418 } else {
1419 array_ptr->key.sv = HeSVKEY(he);
1420 }
1421 array_ptr->val.he = he;
1422 array_ptr++;
1423 }
1424
1425 srl_qsort(aTHX_ enc, n, array);
1426
1427 array_end = array + n;
1428 for ( array_end= array + n; array < array_end; array++ ) {
1429 SV *v;
1430 he = array->val.he;
1431 v = hv_iterval(src, he);
1432 srl_dump_hk(aTHX_ enc, he, do_share_keys);
1433 CALL_SRL_DUMP_SV(enc, v);
1434 }
1435 }
1436 }
1437
1438 SRL_STATIC_INLINE void
srl_dump_hv(pTHX_ srl_encoder_t * enc,HV * src,U32 refcount)1439 srl_dump_hv(pTHX_ srl_encoder_t *enc, HV *src, U32 refcount)
1440 {
1441 HE *he;
1442 UV n;
1443 if ( SvMAGICAL(src) ) {
1444 /* for tied hashes, we have to iterate to find the number of entries. Alas... */
1445 n= 0;
1446 (void)hv_iterinit(src); /* return value not reliable according to API docs */
1447 while ((he = hv_iternext(src))) { ++n; }
1448 }
1449 else {
1450 n= HvUSEDKEYS(src);
1451 }
1452
1453 BUF_SIZE_ASSERT_HV(&enc->buf, n);
1454 if (n < 16 && refcount == 1 && !SRL_ENC_HAVE_OPTION(enc,SRL_F_CANONICAL_REFS)) {
1455 enc->buf.pos--; /* backup over the previous REFN */
1456 srl_buf_cat_char_nocheck(&enc->buf, SRL_HDR_HASHREF + n);
1457 } else {
1458 srl_buf_cat_varint_nocheck(aTHX_ &enc->buf, SRL_HDR_HASH, n);
1459 }
1460
1461 if ( n ) {
1462 if ( SvMAGICAL(src) || SRL_ENC_HAVE_OPTION(enc, SRL_F_SORT_KEYS_PERL) ) {
1463 /* SORT_KEYS_PERL implies SORT_KEYS, but we check for either just to be
1464 * careful - yves*/
1465 if ( SRL_ENC_HAVE_OPTION(enc, SRL_F_SORT_KEYS|SRL_F_SORT_KEYS_PERL) ) {
1466 srl_dump_hv_sorted_sv_slow(aTHX_ enc, src, n, NULL);
1467 }
1468 else {
1469 srl_dump_hv_unsorted_mg(aTHX_ enc, src, n);
1470 }
1471 }
1472 else {
1473 if ( SRL_ENC_HAVE_OPTION(enc, SRL_F_SORT_KEYS) ) {
1474 srl_dump_hv_sorted_nomg(aTHX_ enc, src, n);
1475 }
1476 else {
1477 srl_dump_hv_unsorted_nomg(aTHX_ enc, src, n);
1478 }
1479 }
1480 }
1481 }
1482
1483
1484
1485 SRL_STATIC_INLINE void
srl_dump_hk(pTHX_ srl_encoder_t * enc,HE * src,const int share_keys)1486 srl_dump_hk(pTHX_ srl_encoder_t *enc, HE *src, const int share_keys)
1487 {
1488 char *str;
1489 STRLEN len;
1490 char mode;
1491
1492 if (HeKLEN(src) == HEf_SVKEY) {
1493 SV *sv = HeSVKEY(src);
1494
1495 SvGETMAGIC(sv);
1496 str = SvPV(sv, len);
1497 mode= SvUTF8(sv) ? 1 : 0;
1498
1499 }
1500 else {
1501 str = HeKEY(src);
1502 /* This logic is an optimization for output space: We keep track of
1503 * all seen hash key strings that are in perl's shared string storage.
1504 * If we see one again, we just emit a COPY instruction.
1505 * This means that we only need to keep a ptr table since the strings
1506 * don't move in the shared key storage -- otherwise, we'd have to
1507 * compare strings / keep a full string hash table. */
1508 if ( share_keys && SRL_ENC_HAVE_OPTION(enc, SRL_F_SHARED_HASHKEYS) /* only enter branch if shared hk's enabled */
1509 #if PERL_VERSION >= 10
1510 && (!DO_SHARED_HASH_ENTRY_REFCOUNT_CHECK
1511 || src->he_valu.hent_refcount > 1)
1512 #endif
1513 )
1514 {
1515 PTABLE_t *string_seenhash = SRL_GET_STR_PTR_SEENHASH(enc);
1516 const ptrdiff_t oldoffset = (ptrdiff_t)PTABLE_fetch(string_seenhash, str);
1517 if (oldoffset != 0) {
1518 /* Issue COPY instead of literal hash key string */
1519 srl_buf_cat_varint(aTHX_ &enc->buf, SRL_HDR_COPY, (UV)oldoffset);
1520 return;
1521 }
1522 else {
1523 /* remember current offset before advancing it */
1524 const ptrdiff_t newoffset = BODY_POS_OFS(&enc->buf);
1525 PTABLE_store(string_seenhash, (void *)str, INT2PTR(void *, newoffset));
1526 }
1527 }
1528 len= HeKLEN(src);
1529 mode= HeKWASUTF8(src) ? 2 : HeKUTF8(src) ? 1 : 0;
1530 }
1531 if (mode == 2) { /* must convert back to utf8 */
1532 char* utf8= (char *)Perl_bytes_to_utf8(aTHX_ (U8 *)str, &len);
1533 srl_dump_pv(aTHX_ enc, utf8, len, 1);
1534 Safefree(utf8);
1535 } else {
1536 srl_dump_pv(aTHX_ enc, str, len, mode);
1537 }
1538 }
1539
1540 SRL_STATIC_INLINE void
srl_dump_svpv(pTHX_ srl_encoder_t * enc,SV * src)1541 srl_dump_svpv(pTHX_ srl_encoder_t *enc, SV *src)
1542 {
1543 STRLEN len;
1544 const char * const str= SvPV(src, len);
1545 if ( SRL_ENC_HAVE_OPTION(enc, SRL_F_DEDUPE_STRINGS) && len > 3 ) {
1546 HV *string_deduper_hv= SRL_GET_STR_DEDUPER_HV(enc);
1547 HE *dupe_offset_he= hv_fetch_ent(string_deduper_hv, src, 1, 0);
1548 if (!dupe_offset_he) {
1549 croak("out of memory (hv_fetch_ent returned NULL)");
1550 } else {
1551 const char out_tag= SRL_ENC_HAVE_OPTION(enc, SRL_F_ALIASED_DEDUPE_STRINGS)
1552 ? SRL_HDR_ALIAS
1553 : SRL_HDR_COPY;
1554 SV *ofs_sv= HeVAL(dupe_offset_he);
1555 if (SvIOK(ofs_sv)) {
1556 /* emit copy or alias */
1557 if (out_tag == SRL_HDR_ALIAS)
1558 SRL_SET_TRACK_FLAG(*(enc->buf.body_pos + SvUV(ofs_sv)));
1559 srl_buf_cat_varint(aTHX_ &enc->buf, out_tag, SvIV(ofs_sv));
1560 return;
1561 } else if (SvUOK(ofs_sv)) {
1562 srl_buf_cat_varint(aTHX_ &enc->buf, out_tag, SvUV(ofs_sv));
1563 return;
1564 } else {
1565 /* start tracking this string */
1566 sv_setuv(ofs_sv, (UV)BODY_POS_OFS(&enc->buf));
1567 }
1568 }
1569 }
1570 srl_dump_pv(aTHX_ enc, str, len, SvUTF8(src));
1571 }
1572
1573 SRL_STATIC_INLINE void
srl_dump_pv(pTHX_ srl_encoder_t * enc,const char * src,STRLEN src_len,int is_utf8)1574 srl_dump_pv(pTHX_ srl_encoder_t *enc, const char* src, STRLEN src_len, int is_utf8)
1575 {
1576 BUF_SIZE_ASSERT(&enc->buf, 1 + SRL_MAX_VARINT_LENGTH + src_len); /* overallocate a bit sometimes */
1577 if (is_utf8) {
1578 srl_buf_cat_varint_nocheck(aTHX_ &enc->buf, SRL_HDR_STR_UTF8, src_len);
1579 } else if (src_len <= SRL_MASK_SHORT_BINARY_LEN) {
1580 srl_buf_cat_char_nocheck(&enc->buf, SRL_HDR_SHORT_BINARY_LOW | (char)src_len);
1581 } else {
1582 srl_buf_cat_varint_nocheck(aTHX_ &enc->buf, SRL_HDR_BINARY, src_len);
1583 }
1584 Copy(src, enc->buf.pos, src_len, char);
1585 enc->buf.pos += src_len;
1586 }
1587
1588 #ifdef HAS_HV_BACKREFS
1589 AV *
srl_hv_backreferences_p_safe(pTHX_ HV * hv)1590 srl_hv_backreferences_p_safe(pTHX_ HV *hv) {
1591 if (SvOOK(hv)) {
1592 struct xpvhv_aux * const iter = HvAUX(hv);
1593 return iter->xhv_backreferences;
1594 } else {
1595 return NULL;
1596 }
1597 }
1598 #endif
1599
1600 /* Dumps generic SVs and delegates
1601 * to more specialized functions for RVs, etc. */
1602 /* TODO decide when to use the IV, when to use the PV, and when
1603 * to use the NV slots of the SV.
1604 * Safest simple solution seems "prefer string" (fuck dualvars).
1605 * Potentially better but slower: If we would choose the string,
1606 * then try int-to-string (respective float-to-string) conversion
1607 * and strcmp. If same, then use int or float.
1608 */
1609 static void
srl_dump_sv(pTHX_ srl_encoder_t * enc,SV * src)1610 srl_dump_sv(pTHX_ srl_encoder_t *enc, SV *src)
1611 {
1612 UV refcount;
1613 svtype svt;
1614 MAGIC *mg;
1615 AV *backrefs;
1616 SV* refsv= NULL;
1617 SV* replacement= NULL;
1618 UV weakref_ofs= 0; /* preserved between loops */
1619 SSize_t ref_rewrite_pos= 0; /* preserved between loops - note SSize_t is a perl define */
1620 assert(src);
1621
1622 if (expect_false( ++enc->recursion_depth == enc->max_recursion_depth )) {
1623 croak("Hit maximum recursion depth (%"UVuf"), aborting serialization",
1624 (UV)enc->max_recursion_depth);
1625 }
1626
1627 redo_dump:
1628 mg= NULL;
1629 backrefs= NULL;
1630 svt = SvTYPE(src);
1631 refcount = SvREFCNT(src);
1632 DEBUG_ASSERT_BUF_SANE(&enc->buf);
1633 if ( SvMAGICAL(src) ) {
1634 SvGETMAGIC(src);
1635 #ifdef HAS_HV_BACKREFS
1636 if (svt != SVt_PVHV)
1637 #endif
1638 mg = mg_find(src, PERL_MAGIC_backref);
1639 }
1640 #ifdef HAS_HV_BACKREFS
1641 if (expect_false( svt == SVt_PVHV && SvOOK(src) )) {
1642 backrefs= srl_hv_backreferences_p_safe(aTHX_ (HV *)src);
1643 if (DEBUGHACK) warn("backreferences %p", src);
1644 }
1645 #endif
1646 if (expect_false( mg || backrefs )) {
1647 PTABLE_t *weak_seenhash= SRL_GET_WEAK_SEENHASH(enc);
1648 PTABLE_ENTRY_t *pe= PTABLE_find(weak_seenhash, src);
1649 if (!pe) {
1650 /* not seen it before */
1651 if (DEBUGHACK) warn("scalar %p - is weak referent, storing %"UVuf, src, weakref_ofs);
1652 /* if weakref_ofs is false we got here some way that holds a refcount on this item */
1653 PTABLE_store(weak_seenhash, src, INT2PTR(void *, weakref_ofs));
1654 } else {
1655 if (DEBUGHACK) warn("scalar %p - is weak referent, seen before value:%"UVuf" weakref_ofs:%"UVuf,
1656 src, (UV)pe->value, (UV)weakref_ofs);
1657 if (pe->value)
1658 pe->value= INT2PTR(void *, weakref_ofs);
1659 }
1660 refcount++;
1661 weakref_ofs= 0;
1662 }
1663
1664 /* check if we have seen this scalar before, and track it so
1665 * if we see it again we recognize it */
1666 if ( expect_false( refcount > 1 ) ) {
1667 if (src == &PL_sv_undef && enc->protocol_version >=3 ) {
1668 srl_buf_cat_char(&enc->buf, SRL_HDR_CANONICAL_UNDEF);
1669 --enc->recursion_depth;
1670 return;
1671 }
1672 else
1673 if (src == &PL_sv_yes) {
1674 srl_buf_cat_char(&enc->buf, SRL_HDR_TRUE);
1675 --enc->recursion_depth;
1676 return;
1677 }
1678 else
1679 if (src == &PL_sv_no) {
1680 srl_buf_cat_char(&enc->buf, SRL_HDR_FALSE);
1681 --enc->recursion_depth;
1682 return;
1683 }
1684 else {
1685 PTABLE_t *ref_seenhash= SRL_GET_REF_SEENHASH(enc);
1686 const ptrdiff_t oldoffset = (ptrdiff_t)PTABLE_fetch(ref_seenhash, src);
1687 if (expect_false(oldoffset)) {
1688 /* we have seen it before, so we do not need to bless it again */
1689 if (ref_rewrite_pos) {
1690 if (DEBUGHACK) warn("ref to %p as %"UVuf, src, (UV)oldoffset);
1691 enc->buf.pos= enc->buf.body_pos + ref_rewrite_pos;
1692 srl_buf_cat_varint(aTHX_ &enc->buf, SRL_HDR_REFP, (UV)oldoffset);
1693 } else {
1694 if (DEBUGHACK) warn("alias to %p as %"UVuf, src, (UV)oldoffset);
1695 srl_buf_cat_varint(aTHX_ &enc->buf, SRL_HDR_ALIAS, (UV)oldoffset);
1696 }
1697 SRL_SET_TRACK_FLAG(*(enc->buf.body_pos + oldoffset));
1698 --enc->recursion_depth;
1699 return;
1700 }
1701 if (DEBUGHACK) warn("storing %p as %"UVuf, src, (UV)BODY_POS_OFS(&enc->buf));
1702 PTABLE_store(ref_seenhash, src, INT2PTR(void *, BODY_POS_OFS(&enc->buf)));
1703 }
1704 }
1705
1706 if (expect_false( weakref_ofs != 0 )) {
1707 sv_dump(src);
1708 croak("Corrupted weakref? weakref_ofs should be 0, but got %"UVuf" (this should not happen)", weakref_ofs);
1709 }
1710
1711 if (replacement) {
1712 if (SvROK(replacement)) {
1713 src= SvRV(replacement);
1714 } else {
1715 src= replacement;
1716 }
1717 replacement= NULL;
1718 svt = SvTYPE(src);
1719 /* plus one ensures that later on we get REFN/ARRAY and not ARRAYREF - This is horrible tho. needs to be revisited another day */
1720 refcount= SvREFCNT(src) + 1;
1721 /* We could, but do not do the following:*/
1722 /* goto redo_dump; */
1723 /* Probably a "proper" solution would, but there are nits there that I dont want to chase right now. */
1724 }
1725
1726 /* --------------------------------- */
1727 _SRL_IF_SIMPLE_DIRECT_DUMP_SV(enc, src, svt)
1728 else
1729 #if defined(MODERN_REGEXP) && defined(REGEXP_NO_LONGER_POK)
1730 /* Only need to enter here if we have rather modern regexps AND they're
1731 * NO LONGER POK (5.17.6 and up). */
1732 if ( expect_false( svt == SVt_REGEXP ) ) {
1733 srl_dump_regexp(aTHX_ enc, src);
1734 }
1735 else
1736 #endif
1737 if (SvROK(src)) {
1738 /* dump references */
1739 SV *referent= SvRV(src);
1740 /* assert()-like hack to be compiled out by default */
1741 #ifndef NDEBUG
1742 if (!referent) {
1743 sv_dump(src);
1744 assert(referent);
1745 }
1746 #endif
1747 if (expect_false( SvWEAKREF(src) )) {
1748 if (DEBUGHACK) warn("Is weakref %p", src);
1749 weakref_ofs= BODY_POS_OFS(&enc->buf);
1750 srl_buf_cat_char(&enc->buf, SRL_HDR_WEAKEN);
1751 }
1752
1753 ref_rewrite_pos= BODY_POS_OFS(&enc->buf);
1754
1755 if ( expect_false( sv_isobject(src) ) ) {
1756 /* Write bless operator with class name */
1757 replacement= srl_get_frozen_object(aTHX_ enc, src, referent);
1758 if (srl_dump_classname(aTHX_ enc, referent, replacement)) {
1759 /* 1 means we should not rewrite away the classname */
1760 ref_rewrite_pos= BODY_POS_OFS(&enc->buf);
1761 }
1762 }
1763
1764 srl_buf_cat_char(&enc->buf, SRL_HDR_REFN);
1765 refsv= src;
1766 src= referent;
1767
1768 if (DEBUGHACK) warn("Going to redo %p", src);
1769 goto redo_dump;
1770 }
1771 else
1772 #ifndef MODERN_REGEXP
1773 if (
1774 svt == SVt_PVMG &&
1775 ((SvFLAGS(src) & (SVs_OBJECT|SVf_OK|SVs_GMG|SVs_SMG|SVs_RMG)) == (SVs_OBJECT|BFD_Svs_SMG_OR_RMG)) &&
1776 (mg = mg_find(src, PERL_MAGIC_qr))
1777 ) {
1778 /* Houston, we have a regex! */
1779 srl_dump_regexp(aTHX_ enc, (SV*)mg); /* yes the SV* cast makes me feel dirty too */
1780 }
1781 else
1782 #endif
1783 if (svt == SVt_PVHV) {
1784 srl_dump_hv(aTHX_ enc, (HV *)src, refcount);
1785 }
1786 else
1787 if (svt == SVt_PVAV) {
1788 srl_dump_av(aTHX_ enc, (AV *)src, refcount);
1789 }
1790 else
1791 if ( ! SvOK(src) ) { /* undef and weird shit */
1792 if ( SRL_UNSUPPORTED_SvTYPE(svt) ) {
1793 /* we exclude magic, because magic sv's can be undef too */
1794 /* called when we find an unsupported type/reference. May either throw exception
1795 * or write ONE (nested or single) item to the buffer. */
1796 #define SRL_HANDLE_UNSUPPORTED_SvTYPE(enc, src, svt, refsv, ref_rewrite_pos) \
1797 STMT_START { \
1798 if ( SRL_ENC_HAVE_OPTION((enc), SRL_F_UNDEF_UNKNOWN) ) { \
1799 if (SRL_ENC_HAVE_OPTION((enc), SRL_F_WARN_UNKNOWN)) \
1800 warn("Found type %u %s(0x%p), but it is not representable " \
1801 "by the Sereal encoding format; will encode as an " \
1802 "undefined value", (svt), sv_reftype((src),0),(src)); \
1803 if (ref_rewrite_pos) { \
1804 /* make sure we don't keep a reference to the thing that we do not \
1805 * want to serialize around for REFP and ALIAS output */ \
1806 PTABLE_t *ref_seenhash= SRL_GET_REF_SEENHASH(enc); \
1807 PTABLE_delete(ref_seenhash, src); \
1808 enc->buf.pos= enc->buf.body_pos + ref_rewrite_pos; \
1809 } \
1810 srl_buf_cat_char(&(enc)->buf, SRL_HDR_UNDEF); \
1811 } \
1812 else if ( SRL_ENC_HAVE_OPTION((enc), SRL_F_STRINGIFY_UNKNOWN) ) { \
1813 STRLEN len; \
1814 char *str; \
1815 if (SRL_ENC_HAVE_OPTION((enc), SRL_F_WARN_UNKNOWN)) { \
1816 /* In theory, we need to warn about stringifying this unsupported \
1817 * item. However, if the SRL_F_NOWARN_UNKNOWN_OVERLOAD option is set, \
1818 * then we DO NOT warn about stringifying this unsupported item if \
1819 * it is an object with string overloading (assuming it's done on \
1820 * purpose to stringify in cases like these). \
1821 */ \
1822 if (!SRL_ENC_HAVE_OPTION((enc), SRL_F_NOWARN_UNKNOWN_OVERLOAD) \
1823 || !SvOBJECT(src) \
1824 || !Gv_AMG(SvSTASH(src))) \
1825 { \
1826 warn("Found type %u %s(0x%p), but it is not representable " \
1827 "by the Sereal encoding format; will encode as a " \
1828 "stringified form", (svt), sv_reftype((src),0),(src)); \
1829 } \
1830 } \
1831 if (ref_rewrite_pos) { \
1832 /* make sure we don't keep a reference to the thing that we do not \
1833 * want to serialize around for REFP and ALIAS output */ \
1834 PTABLE_t *ref_seenhash= SRL_GET_REF_SEENHASH(enc); \
1835 PTABLE_delete(ref_seenhash, src); \
1836 enc->buf.pos= enc->buf.body_pos + ref_rewrite_pos; \
1837 str = SvPV((refsv), len); \
1838 } else \
1839 str = SvPV((src), len); \
1840 srl_dump_pv(aTHX_ (enc), (str), len, SvUTF8(src)); \
1841 } \
1842 else { \
1843 croak("Found type %u %s(0x%p), but it is not representable " \
1844 "by the Sereal encoding format", (svt), sv_reftype((src),0),(src)); \
1845 } \
1846 } STMT_END
1847 SRL_HANDLE_UNSUPPORTED_SvTYPE(enc, src, svt, refsv, ref_rewrite_pos);
1848 }
1849 else if (src == &PL_sv_undef && enc->protocol_version >= 3 ) {
1850 srl_buf_cat_char(&enc->buf, SRL_HDR_CANONICAL_UNDEF);
1851 } else {
1852 srl_buf_cat_char(&enc->buf, SRL_HDR_UNDEF);
1853 }
1854 }
1855 else {
1856 SRL_HANDLE_UNSUPPORTED_SvTYPE(enc, src, svt, refsv, ref_rewrite_pos);
1857 #undef SRL_HANDLE_UNSUPPORTED_SvTYPE
1858 }
1859 --enc->recursion_depth;
1860 }
1861
1862