1 /* Must be defined before including Perl header files or we slow down by 2x! */
2 #define PERL_NO_GET_CONTEXT
3 
4 #ifdef __cplusplus
5 extern "C" {
6 #endif
7 #include "EXTERN.h"
8 #include "perl.h"
9 #include "XSUB.h"
10 #include "ppport.h"
11 #ifdef __cplusplus
12 }
13 #endif
14 
15 #include <stdlib.h>
16 
17 #ifndef PERL_VERSION
18 #    include <patchlevel.h>
19 #    if !(defined(PERL_VERSION) || (PERL_SUBVERSION > 0 && defined(PATCHLEVEL)))
20 #        include <could_not_find_Perl_patchlevel.h>
21 #    endif
22 #    define PERL_REVISION       5
23 #    define PERL_VERSION        PATCHLEVEL
24 #    define PERL_SUBVERSION     PERL_SUBVERSION
25 #endif
26 #if PERL_VERSION < 8
27 #   define PERL_MAGIC_qr                  'r' /* precompiled qr// regex */
28 #   define BFD_Svs_SMG_OR_RMG SVs_RMG
29 #elif ((PERL_VERSION==8) && (PERL_SUBVERSION >= 1) || (PERL_VERSION>8))
30 #   define BFD_Svs_SMG_OR_RMG SVs_SMG
31 #   define MY_PLACEHOLDER PL_sv_placeholder
32 #else
33 #   define BFD_Svs_SMG_OR_RMG SVs_RMG
34 #   define MY_PLACEHOLDER PL_sv_undef
35 #endif
36 #if (((PERL_VERSION == 9) && (PERL_SUBVERSION >= 4)) || (PERL_VERSION > 9))
37 #   define NEW_REGEX_ENGINE 1
38 #endif
39 #if (((PERL_VERSION == 8) && (PERL_SUBVERSION >= 1)) || (PERL_VERSION > 8))
40 #define MY_CAN_FIND_PLACEHOLDERS
41 #define HAS_SV2OBJ
42 #endif
43 
44 /* hv_backreferences_p is not marked as exported in embed.fnc in any perl */
45 #if (PERL_VERSION >= 10)
46 #define HAS_HV_BACKREFS
47 #endif
48 
49 #include "srl_protocol.h"
50 #include "srl_encoder.h"
51 #include "srl_common.h"
52 #include "ptable.h"
53 #include "srl_buffer.h"
54 #include "srl_compress.h"
55 #include "qsort.h"
56 
57 /* The ENABLE_DANGEROUS_HACKS (passed through from ENV via Makefile.PL) enables
58  * optimizations that may make the code so cozy with a particular version of the
59  * Perl core that the code is no longer portable and/or compatible.
60  * It would be great to determine where these hacks are safe and enable them
61  * where possible. Gut feeling as for portability is that most things will be
62  * ok on Unixes, but fail on the stricter Win32. As for compatibility with old
63  * versions of perl, all bets are off.
64  */
65 #ifdef ENABLE_DANGEROUS_HACKS
66     /* It's unclear why DO_SHARED_HASH_ENTRY_REFCOUNT_CHECK doesn't
67      * help much. It basically means breaking perl's encapsulation to
68      * check whether a HE (hash entry) that is shared has a refcount > 1
69      * and only bothers inserting key into our ptr table if that's the
70      * case. Benchmarks don't show much of a difference and it's a high
71      * price to pay to break encapsulation for something that's not
72      * measureable.
73      */
74     /* DO_SHARED_HASH_ENTRY_REFCOUNT_CHECK only works on 5.10 and better */
75 #   define DO_SHARED_HASH_ENTRY_REFCOUNT_CHECK 1
76 #else
77 #   define DO_SHARED_HASH_ENTRY_REFCOUNT_CHECK 0
78 #endif
79 
80 #define DEFAULT_MAX_RECUR_DEPTH 10000
81 
82 #define DEBUGHACK 0
83 
84 /* some static function declarations */
85 SRL_STATIC_INLINE void srl_clear_seen_hashes(pTHX_ srl_encoder_t *enc);
86 static void srl_dump_sv(pTHX_ srl_encoder_t *enc, SV *src);
87 SRL_STATIC_INLINE void srl_dump_svpv(pTHX_ srl_encoder_t *enc, SV *src);
88 SRL_STATIC_INLINE void srl_dump_pv(pTHX_ srl_encoder_t *enc, const char* src, STRLEN src_len, int is_utf8);
89 SRL_STATIC_INLINE void srl_fixup_weakrefs(pTHX_ srl_encoder_t *enc);
90 SRL_STATIC_INLINE void srl_dump_av(pTHX_ srl_encoder_t *enc, AV *src, U32 refcnt);
91 SRL_STATIC_INLINE void srl_dump_hv(pTHX_ srl_encoder_t *enc, HV *src, U32 refcnt);
92 SRL_STATIC_INLINE void srl_dump_hk(pTHX_ srl_encoder_t *enc, HE *src, const int share_keys);
93 SRL_STATIC_INLINE void srl_dump_nv(pTHX_ srl_encoder_t *enc, SV *src);
94 SRL_STATIC_INLINE void srl_dump_ivuv(pTHX_ srl_encoder_t *enc, SV *src);
95 SRL_STATIC_INLINE int srl_dump_classname(pTHX_ srl_encoder_t *enc, SV *referent, SV *replacement);
96 SRL_STATIC_INLINE SV *srl_get_frozen_object(pTHX_ srl_encoder_t *enc, SV *src, SV *referent);
97 SRL_STATIC_INLINE PTABLE_t *srl_init_string_hash(srl_encoder_t *enc);
98 SRL_STATIC_INLINE PTABLE_t *srl_init_ref_hash(srl_encoder_t *enc);
99 SRL_STATIC_INLINE PTABLE_t *srl_init_freezeobj_svhash(srl_encoder_t *enc);
100 SRL_STATIC_INLINE PTABLE_t *srl_init_weak_hash(srl_encoder_t *enc);
101 SRL_STATIC_INLINE HV *srl_init_string_deduper_hv(pTHX_ srl_encoder_t *enc);
102 
103 /* Note: This returns an encoder struct pointer because it will
104  *       clone the current encoder struct if it's dirty. That in
105  *       turn means in order to access the output buffer, you need
106  *       to inspect the returned encoder struct. If necessary, it
107  *       will be cleaned up automatically by Perl, so don't bother
108  *       freeing it. */
109 SRL_STATIC_INLINE srl_encoder_t *srl_dump_data_structure(pTHX_ srl_encoder_t *enc, SV *src, SV *user_header_src);
110 
111 #define SRL_GET_STR_DEDUPER_HV(enc) ( (enc)->string_deduper_hv == NULL          \
112                                     ? srl_init_string_deduper_hv(aTHX_ enc)     \
113                                    : (enc)->string_deduper_hv )
114 
115 #define SRL_GET_STR_PTR_SEENHASH(enc) ( (enc)->str_seenhash == NULL     \
116                                     ? srl_init_string_hash(enc)         \
117                                    : (enc)->str_seenhash )
118 
119 #define SRL_GET_REF_SEENHASH(enc) ( (enc)->ref_seenhash == NULL     \
120                                     ? srl_init_ref_hash(enc)        \
121                                    : (enc)->ref_seenhash )
122 
123 #define SRL_GET_WEAK_SEENHASH(enc) ( (enc)->weak_seenhash == NULL   \
124                                     ? srl_init_weak_hash(enc)       \
125                                    : (enc)->weak_seenhash )
126 
127 #define SRL_GET_WEAK_SEENHASH_OR_NULL(enc) ((enc)->weak_seenhash)
128 
129 #define SRL_GET_FREEZEOBJ_SVHASH(enc) ( (enc)->freezeobj_svhash == NULL     \
130                                         ? srl_init_freezeobj_svhash(enc)    \
131                                         : (enc)->freezeobj_svhash )
132 
133 #define SRL_ENC_UPDATE_BODY_POS(enc) SRL_UPDATE_BODY_POS(&(enc)->buf, (enc)->protocol_version)
134 
135 #ifndef MAX_CHARSET_NAME_LENGTH
136 #    define MAX_CHARSET_NAME_LENGTH 2
137 #endif
138 
139 #if PERL_VERSION == 10
140 /*
141 	Apparently regexes in 5.10 are "modern" but with 5.8 internals
142 */
143 #ifndef RXf_PMf_STD_PMMOD_SHIFT
144 #    define RXf_PMf_STD_PMMOD_SHIFT 12
145 #endif
146 #ifndef RE_EXTFLAGS
147 #    define RX_EXTFLAGS(re)	((re)->extflags)
148 #endif
149 #ifndef RX_PRECOMP
150 #    define RX_PRECOMP(re) ((re)->precomp)
151 #endif
152 #ifndef RX_PRELEN
153 #    define RX_PRELEN(re) ((re)->prelen)
154 #endif
155 
156 /* Maybe this is only on OS X, where SvUTF8(sv) exists but looks at flags that don't exist */
157 #ifndef RX_UTF8
158 #    define RX_UTF8(re) (RX_EXTFLAGS(re) & RXf_UTF8)
159 #endif
160 
161 #elif defined(SvRX)
162 #    define MODERN_REGEXP
163 #    if ( PERL_VERSION > 27 || (PERL_VERSION == 27 && PERL_SUBVERSION >= 3) )
164      /* Commit df6b4bd56551f2d39f7c0019c23f27181d8c39c4
165       * changed the behavior mentioned below, so that the POK flag is on again. Sigh.
166       * So this branch is a deliberate NO-OP, it just makes the conditions easier to read.*/
167 #    elif ( PERL_VERSION > 17 || (PERL_VERSION == 17 && PERL_SUBVERSION >= 6) )
168      /* With commit 8d919b0a35f2b57a6bed2f8355b25b19ac5ad0c5 (perl.git) and
169       * release 5.17.6, regular expression are no longer SvPOK (IOW are no longer
170       * considered to be containing a string).
171       * This breaks some of the REGEXP detection logic in srl_dump_sv, so
172       * we need yet another CPP define. */
173 #        define REGEXP_NO_LONGER_POK
174 #    endif
175 #else
176 #    define INT_PAT_MODS "msix"
177 #    define RXf_PMf_STD_PMMOD_SHIFT 12
178 #    define RX_PRECOMP(re) ((re)->precomp)
179 #    define RX_PRELEN(re) ((re)->prelen)
180 #    define RX_UTF8(re) ((re)->reganch & ROPT_UTF8)
181 #    define RX_EXTFLAGS(re) ((re)->reganch)
182 #    define RXf_PMf_COMPILETIME  PMf_COMPILETIME
183 #endif
184 
185 #if defined(MODERN_REGEXP) && !defined(REGEXP_NO_LONGER_POK)
186 #define DO_POK_REGEXP(enc, src, svt)                                    \
187         /* Only need to enter here if we have rather modern regexps,*/  \
188         /* but they're still POK (pre 5.17.6). */                       \
189         if (expect_false( svt == SVt_REGEXP ) ) {                       \
190             srl_dump_regexp(aTHX_ enc, src);                            \
191         }                                                               \
192         else
193 #else
194 #define DO_POK_REGEXP(enc, src, svt) /*no-op*/
195 #endif
196 
197 #define _SRL_IF_SIMPLE_DIRECT_DUMP_SV(enc, src, svt)                        \
198     if (SvPOK(src)) {                                                       \
199         STRLEN L;                                                           \
200         char *PV= SvPV(src, L);                                             \
201         if ( SvIOK(src) ) {                                                 \
202             if ( SvIV(src) == 0 ) {                                         \
203                 if ( L == 1 && PV[0] == '0' ) {                             \
204                     /* its a true 0 */                                      \
205                     srl_buf_cat_char(&enc->buf, SRL_HDR_POS + 0);           \
206                 }                                                           \
207                 else {                                                      \
208                     /* must be a string */                                  \
209                     srl_dump_svpv(aTHX_ enc, src);                          \
210                 }                                                           \
211             }                                                               \
212             else                                                            \
213             if (                                                            \
214                 !L ||                                                       \
215                 !isDIGIT(PV[L-1]) ||                                        \
216                 (                                                           \
217                  SvIV(src) > 0                                              \
218                  ? ( PV[0] == '0' || !isDIGIT(PV[0]) )                      \
219                  : ( L < 2 || PV[0] != '-' || PV[1] == '0' || !isDIGIT(PV[1]) ) \
220                 )                                                           \
221             ) {                                                             \
222                 srl_dump_svpv(aTHX_ enc, src);                              \
223             }                                                               \
224             else {                                                          \
225                 if ( SvNOK(src) ) {                                         \
226                     /* fallback to checking if the canonical stringified*/  \
227                     /* int is the same as the buffer */                     \
228                     sv_setiv(enc->scratch_sv,SvIV(src));                    \
229                     if ( sv_cmp(enc->scratch_sv,src) ) {                    \
230                         srl_dump_svpv(aTHX_ enc, src);                      \
231                     } else {                                                \
232                         srl_dump_ivuv(aTHX_ enc, src);                      \
233                     }                                                       \
234                 } else {                                                    \
235                     srl_dump_ivuv(aTHX_ enc, src);                          \
236                 }                                                           \
237             }                                                               \
238         }                                                                   \
239         else                                                                \
240         if ( SvNOK(src) ) {                                                 \
241             if ( L <= 8 ||                                                  \
242                 !isDIGIT(PV[0]) ||                                          \
243                 !isDIGIT(PV[L-1]) ||                                        \
244                 PV[L-1] == '0' ||                                           \
245                  (                                                          \
246                   SvNV(src) > 0.0                                           \
247                   ? ( PV[0] == '.' || (PV[0] == '0' && PV[1] != '.') )      \
248                   : ( PV[0] != '-' || PV[1] == '.' || (PV[1] == '0' && PV[2] != '.')) \
249                 )                                                           \
250             ) {                                                             \
251                 srl_dump_svpv(aTHX_ enc, src);                              \
252             }                                                               \
253             else {                                                          \
254                 srl_dump_nv(aTHX_ enc, src);                                \
255             }                                                               \
256         }                                                                   \
257         else {                                                              \
258             DO_POK_REGEXP(enc,src,svt)                                      \
259             srl_dump_svpv(aTHX_ enc, src);                                  \
260         }                                                                   \
261     }                                                                       \
262     else                                                                    \
263     if ( SvIOK(src) ) {                                                     \
264         srl_dump_ivuv(aTHX_ enc, src);                                  \
265     }                                                                   \
266     else                                                                \
267     /* if its a float then its a float */                               \
268     if (SvNOK(src)) {                                                   \
269         srl_dump_nv(aTHX_ enc, src);                                    \
270     }                                                                   \
271     else                                                                \
272     /* The POKp, IOKp, NOKp checks below deal with PVLV */              \
273     /* if its POK or POKp, then we treat it as a string */              \
274     if (SvPOKp(src)) {                                                  \
275         DO_POK_REGEXP(enc,src,svt)                                      \
276         srl_dump_svpv(aTHX_ enc, src);                                  \
277     }                                                                   \
278     else                                                                \
279     /* if its IOKp then we treat it as an int */                        \
280     if (SvIOKp(src)) {                                                  \
281         srl_dump_ivuv(aTHX_ enc, src);                                  \
282     }                                                                   \
283     else                                                                \
284     /* if its NOKp then we treat it as an nv */                         \
285     if (SvNOKp(src)) {                                                  \
286         srl_dump_nv(aTHX_ enc, src);                                    \
287     }                                                                   \
288 
289 #define CALL_SRL_DUMP_SV(enc, src) STMT_START {                                  \
290     if (!(src)) {                                                                   \
291         srl_buf_cat_char(&(enc)->buf, SRL_HDR_CANONICAL_UNDEF); /* is this right? */\
292     }                                                                               \
293     else                                                                            \
294     {                                                                               \
295 	svtype svt;								    \
296 	SvGETMAGIC(src);							    \
297 	svt= SvTYPE((src));							    \
298         if (svt < SVt_PVMG &&                                                       \
299             SvREFCNT((src)) == 1 &&                                                 \
300             !SvROK((src))                                                           \
301         ) {                                                                         \
302             _SRL_IF_SIMPLE_DIRECT_DUMP_SV(enc, src, svt)                            \
303             else {                                                                  \
304                 srl_dump_sv(aTHX_ (enc), (src));                                    \
305             }                                                                       \
306         } else {                                                                    \
307             srl_dump_sv(aTHX_ (enc), (src));                                        \
308         }                                                                           \
309     }                                                                               \
310 } STMT_END
311 
312 #define CALL_SRL_DUMP_SVP(enc, srcp) STMT_START {                                   \
313     if (!(srcp)) {                                                                  \
314         srl_buf_cat_char(&(enc)->buf, SRL_HDR_CANONICAL_UNDEF); /* is this right? */\
315     } else {                                                                        \
316         SV *src= *srcp;                                                             \
317         CALL_SRL_DUMP_SV(enc,src);                                                  \
318     }                                                                               \
319 } STMT_END
320 
321 /* This is fired when we exit the Perl pseudo-block.
322  * It frees our encoder and all. Put encoder-level cleanup
323  * logic here so that we can simply use croak/longjmp for
324  * exception handling. Makes life vastly easier!
325  */
326 void
srl_destructor_hook(pTHX_ void * p)327 srl_destructor_hook(pTHX_ void *p)
328 {
329     srl_encoder_t *enc = (srl_encoder_t *)p;
330     /* Do not auto-destroy encoder if set to be re-used */
331     if (!SRL_ENC_HAVE_OPTION(enc, SRL_F_REUSE_ENCODER)) {
332         /* Exception cleanup. Under normal operation, we should have
333          * assigned NULL to buf_start after we're done. */
334         srl_destroy_encoder(aTHX_ enc);
335     }
336     else {
337         srl_clear_encoder(aTHX_ enc);
338     }
339 }
340 
341 SRL_STATIC_INLINE void
srl_clear_seen_hashes(pTHX_ srl_encoder_t * enc)342 srl_clear_seen_hashes(pTHX_ srl_encoder_t *enc)
343 {
344     if (enc->ref_seenhash != NULL)
345         PTABLE_clear(enc->ref_seenhash);
346     if (enc->freezeobj_svhash != NULL)
347         PTABLE_clear_dec(aTHX_ enc->freezeobj_svhash);
348     if (enc->str_seenhash != NULL)
349         PTABLE_clear(enc->str_seenhash);
350     if (enc->weak_seenhash != NULL)
351         PTABLE_clear(enc->weak_seenhash);
352     if (enc->string_deduper_hv != NULL)
353         hv_clear(enc->string_deduper_hv);
354 }
355 
356 void
srl_clear_encoder(pTHX_ srl_encoder_t * enc)357 srl_clear_encoder(pTHX_ srl_encoder_t *enc)
358 {
359     /* TODO I think this could just be made an assert. */
360     if (!SRL_ENC_HAVE_OPER_FLAG(enc, SRL_OF_ENCODER_DIRTY)) {
361         warn("Sereal Encoder being cleared but in virgin state. That is unexpected.");
362     }
363 
364     enc->recursion_depth = 0;
365     srl_clear_seen_hashes(aTHX_ enc);
366 
367     enc->buf.pos = enc->buf.start;
368     /* tmp_buf.start may be NULL for an unused tmp_buf, but so what? */
369     enc->tmp_buf.pos = enc->tmp_buf.start;
370 
371     SRL_SET_BODY_POS(&enc->buf, enc->buf.start);
372 
373     SRL_ENC_RESET_OPER_FLAG(enc, SRL_OF_ENCODER_DIRTY);
374 }
375 
376 void
srl_destroy_encoder(pTHX_ srl_encoder_t * enc)377 srl_destroy_encoder(pTHX_ srl_encoder_t *enc)
378 {
379     srl_buf_free_buffer(aTHX_ &enc->buf);
380 
381     /* Free tmp buffer only if it was allocated at all. */
382     if (enc->tmp_buf.start != NULL)
383         srl_buf_free_buffer(aTHX_ &enc->tmp_buf);
384 
385     srl_destroy_snappy_workmem(aTHX_ enc->snappy_workmem);
386 
387     if (enc->ref_seenhash != NULL)
388         PTABLE_free(enc->ref_seenhash);
389     if (enc->freezeobj_svhash != NULL)
390         PTABLE_free(enc->freezeobj_svhash);
391     if (enc->str_seenhash != NULL)
392         PTABLE_free(enc->str_seenhash);
393     if (enc->weak_seenhash != NULL)
394         PTABLE_free(enc->weak_seenhash);
395     if (enc->string_deduper_hv != NULL)
396         SvREFCNT_dec(enc->string_deduper_hv);
397 
398     SvREFCNT_dec(enc->sereal_string_sv);
399     SvREFCNT_dec(enc->scratch_sv);
400 
401     Safefree(enc);
402 }
403 
404 /* allocate an empty encoder struct - flags still to be set up */
405 SRL_STATIC_INLINE srl_encoder_t *
srl_empty_encoder_struct(pTHX)406 srl_empty_encoder_struct(pTHX)
407 {
408     srl_encoder_t *enc;
409     Newxz(enc, 1, srl_encoder_t);
410     if (enc == NULL)
411         croak("Out of memory");
412 
413     /* Init buffer struct */
414     if (expect_false( srl_buf_init_buffer(aTHX_ &(enc->buf), INITIALIZATION_SIZE) != 0 )) {
415         Safefree(enc);
416         croak("Out of memory");
417     }
418 
419     enc->protocol_version = SRL_PROTOCOL_VERSION;
420     enc->max_recursion_depth = DEFAULT_MAX_RECUR_DEPTH;
421 
422     return enc;
423 }
424 
425 #define my_hv_fetchs(he,val,opt,idx) STMT_START {                   \
426     he = hv_fetch_ent(opt, options[idx].sv, 0, options[idx].hash);  \
427     if (he)                                                         \
428         val= HeVAL(he);                                             \
429     else                                                            \
430         val= NULL;                                                  \
431 } STMT_END
432 
433 /* Builds the C-level configuration and state struct. */
434 srl_encoder_t *
srl_build_encoder_struct(pTHX_ HV * opt,sv_with_hash * options)435 srl_build_encoder_struct(pTHX_ HV *opt, sv_with_hash *options)
436 {
437     srl_encoder_t *enc;
438     SV *val;
439     HE *he;
440 
441     enc = srl_empty_encoder_struct(aTHX);
442     enc->flags = 0;
443     enc->scratch_sv= newSViv(0);
444 
445     /* load options */
446     if (opt != NULL) {
447         int undef_unknown = 0;
448         int compression_format = 0;
449         /* SRL_F_SHARED_HASHKEYS on by default */
450         my_hv_fetchs(he, val, opt, SRL_ENC_OPT_IDX_NO_SHARED_HASHKEYS);
451         if ( !val || !SvTRUE(val) )
452             SRL_ENC_SET_OPTION(enc, SRL_F_SHARED_HASHKEYS);
453 
454         /* Needs to be before the snappy options */
455         /* enc->protocol_version defaults to SRL_PROTOCOL_VERSION. */
456         my_hv_fetchs(he, val, opt, SRL_ENC_OPT_IDX_PROTOCOL_VERSION);
457         if (val && SvOK(val)) {
458             enc->protocol_version = SvUV(val);
459             if (enc->protocol_version < 1
460                 || enc->protocol_version > SRL_PROTOCOL_VERSION)
461             {
462                 croak("Specified Sereal protocol version (%"UVuf") is invalid",
463                       (UV)enc->protocol_version);
464             }
465         }
466         else {
467             /* Compatibility with the old way to specify older protocol version */
468             my_hv_fetchs(he, val, opt, SRL_ENC_OPT_IDX_USE_PROTOCOL_V1);
469             if ( val && SvTRUE(val) )
470                 enc->protocol_version = 1;
471         }
472 
473         my_hv_fetchs(he, val, opt, SRL_ENC_OPT_IDX_CROAK_ON_BLESS);
474         if ( val && SvTRUE(val) )
475             SRL_ENC_SET_OPTION(enc, SRL_F_CROAK_ON_BLESS);
476 
477         my_hv_fetchs(he, val, opt, SRL_ENC_OPT_IDX_NO_BLESS_OBJECTS);
478         if ( val && SvTRUE(val) )
479             SRL_ENC_SET_OPTION(enc, SRL_F_NO_BLESS_OBJECTS);
480 
481         my_hv_fetchs(he, val, opt, SRL_ENC_OPT_IDX_FREEZE_CALLBACKS);
482         if ( val && SvTRUE(val) ) {
483             if (SRL_ENC_HAVE_OPTION(enc, SRL_F_NO_BLESS_OBJECTS))
484                 croak("The no_bless_objects and freeze_callback_support "
485                       "options are mutually exclusive");
486             SRL_ENC_SET_OPTION(enc, SRL_F_ENABLE_FREEZE_SUPPORT);
487             enc->sereal_string_sv = newSVpvs("Sereal");
488         }
489 
490         my_hv_fetchs(he, val, opt, SRL_ENC_OPT_IDX_COMPRESS);
491         if (val) {
492             compression_format = SvIV(val);
493 
494             /* See also Encoder.pm's constants */
495             switch (compression_format) {
496             case 0: /* uncompressed */
497                 break;
498             case 1:
499                 SRL_ENC_SET_OPTION(enc, SRL_F_COMPRESS_SNAPPY_INCREMENTAL);
500                 break;
501             case 2:
502                 SRL_ENC_SET_OPTION(enc, SRL_F_COMPRESS_ZLIB);
503                 if (enc->protocol_version < 3)
504                     croak("Zlib compression was introduced in protocol version 3 and you are asking for only version %i", (int)enc->protocol_version);
505 
506                 enc->compress_level = MZ_DEFAULT_COMPRESSION;
507                 my_hv_fetchs(he, val, opt, SRL_ENC_OPT_IDX_COMPRESS_LEVEL);
508                 if ( val && SvTRUE(val) ) {
509                     IV lvl = SvIV(val);
510                     if (expect_false( lvl < 1 || lvl > 10 )) /* Sekrit: compression lvl 10 is a miniz thing that doesn't exist in normal zlib */
511                         croak("'compress_level' needs to be between 1 and 9");
512                     enc->compress_level = lvl;
513                 }
514                 break;
515             case 3:
516                 SRL_ENC_SET_OPTION(enc, SRL_F_COMPRESS_ZSTD);
517                 if (enc->protocol_version < 3)
518                     croak("zstd compression was introduced in protocol version 3 and you are asking for only version %i", (int)enc->protocol_version);
519 
520                 enc->compress_level = 3; /* default compression level */
521                 my_hv_fetchs(he, val, opt, SRL_ENC_OPT_IDX_COMPRESS_LEVEL);
522                 if ( val && SvTRUE(val) ) {
523                     IV lvl = SvIV(val);
524                     if (expect_false( lvl < 1 || lvl > 22 )) /* TODO: ZSTD_maxCLevel() */
525                         croak("'compress_level' needs to be between 1 and 22");
526                     enc->compress_level = lvl;
527                 }
528                 break;
529             default:
530                 croak("Invalid Sereal compression format");
531             }
532         }
533         else {
534             /* Only bother with old compression options if necessary */
535 
536             my_hv_fetchs(he, val, opt, SRL_ENC_OPT_IDX_SNAPPY_INCR);
537             if ( val && SvTRUE(val) ) {
538                 SRL_ENC_SET_OPTION(enc, SRL_F_COMPRESS_SNAPPY_INCREMENTAL);
539                 compression_format = 1;
540             }
541              else {
542                 /* snappy_incr >> snappy */
543                 my_hv_fetchs(he, val, opt, SRL_ENC_OPT_IDX_SNAPPY);
544                 if ( val && SvTRUE(val) ) {
545                     /* incremental is the new black in V2 */
546                     if (expect_true( enc->protocol_version > 1 ))
547                         SRL_ENC_SET_OPTION(enc, SRL_F_COMPRESS_SNAPPY_INCREMENTAL);
548                     else
549                         SRL_ENC_SET_OPTION(enc, SRL_F_COMPRESS_SNAPPY);
550                     compression_format = 1;
551                 }
552             }
553         }
554 
555         my_hv_fetchs(he, val, opt, SRL_ENC_OPT_IDX_UNDEF_UNKNOWN);
556         if ( val && SvTRUE(val) ) {
557             undef_unknown = 1;
558             SRL_ENC_SET_OPTION(enc, SRL_F_UNDEF_UNKNOWN);
559         }
560 
561         my_hv_fetchs(he, val, opt, SRL_ENC_OPT_IDX_SORT_KEYS);
562         if ( !val )
563             my_hv_fetchs(he, val, opt, SRL_ENC_OPT_IDX_CANONICAL);
564         if ( val && SvTRUE(val) ) {
565             SRL_ENC_SET_OPTION(enc, SRL_F_SORT_KEYS);
566             if (SvIV(val) > 1) {
567                 SRL_ENC_SET_OPTION(enc, SRL_F_SORT_KEYS_PERL);
568                 if (SvIV(val) > 2) {
569                     SRL_ENC_SET_OPTION(enc, SRL_F_SORT_KEYS_PERL_REV);
570                 }
571             }
572         }
573 
574         my_hv_fetchs(he, val, opt, SRL_ENC_OPT_IDX_CANONICAL_REFS);
575         if ( !val )
576             my_hv_fetchs(he, val, opt, SRL_ENC_OPT_IDX_CANONICAL);
577         if ( val && SvTRUE(val) )
578             SRL_ENC_SET_OPTION(enc, SRL_F_CANONICAL_REFS);
579 
580         my_hv_fetchs(he, val, opt, SRL_ENC_OPT_IDX_ALIASED_DEDUPE_STRINGS);
581         if ( val && SvTRUE(val) )
582             SRL_ENC_SET_OPTION(enc, SRL_F_ALIASED_DEDUPE_STRINGS | SRL_F_DEDUPE_STRINGS);
583         else {
584             my_hv_fetchs(he, val, opt, SRL_ENC_OPT_IDX_DEDUPE_STRINGS);
585             if ( val && SvTRUE(val) )
586                 SRL_ENC_SET_OPTION(enc, SRL_F_DEDUPE_STRINGS);
587         }
588 
589         my_hv_fetchs(he, val, opt, SRL_ENC_OPT_IDX_STRINGIFY_UNKNOWN);
590         if ( val && SvTRUE(val) ) {
591             if (expect_false( undef_unknown ))
592                 croak("'undef_unknown' and 'stringify_unknown' "
593                       "options are mutually exclusive");
594             SRL_ENC_SET_OPTION(enc, SRL_F_STRINGIFY_UNKNOWN);
595         }
596 
597         my_hv_fetchs(he, val, opt, SRL_ENC_OPT_IDX_WARN_UNKNOWN);
598         if ( val && SvTRUE(val) ) {
599             SRL_ENC_SET_OPTION(enc, SRL_F_WARN_UNKNOWN);
600             if (SvIV(val) < 0)
601                 SRL_ENC_SET_OPTION(enc, SRL_F_NOWARN_UNKNOWN_OVERLOAD);
602         }
603 
604         if (compression_format) {
605             enc->compress_threshold = 1024;
606             my_hv_fetchs(he, val, opt, SRL_ENC_OPT_IDX_COMPRESS_THRESHOLD);
607             if ( val && SvOK(val) )
608                 enc->compress_threshold = SvIV(val);
609             else if (compression_format == 1) {
610                 /* compression_format==1 is some sort of Snappy */
611                 my_hv_fetchs(he, val, opt, SRL_ENC_OPT_IDX_SNAPPY_THRESHOLD);
612                 if ( val && SvOK(val) )
613                     enc->compress_threshold = SvIV(val);
614             }
615         }
616 
617         my_hv_fetchs(he, val, opt, SRL_ENC_OPT_IDX_MAX_RECURSION_DEPTH);
618         if ( val && SvTRUE(val) )
619             enc->max_recursion_depth = SvUV(val);
620     }
621     else {
622         /* SRL_F_SHARED_HASHKEYS on by default */
623         SRL_ENC_SET_OPTION(enc, SRL_F_SHARED_HASHKEYS);
624     }
625 
626     DEBUG_ASSERT_BUF_SANE(&enc->buf);
627     return enc;
628 }
629 
630 /* clone an encoder without current state */
631 srl_encoder_t *
srl_build_encoder_struct_alike(pTHX_ srl_encoder_t * proto)632 srl_build_encoder_struct_alike(pTHX_ srl_encoder_t *proto)
633 {
634     srl_encoder_t *enc;
635     enc = srl_empty_encoder_struct(aTHX);
636 
637     /* Copy the configuration-type, non-ephemeral attributes */
638     enc->flags = proto->flags;
639     enc->max_recursion_depth = proto->max_recursion_depth;
640     enc->compress_threshold = proto->compress_threshold;
641     if (expect_false(SRL_ENC_HAVE_OPTION(enc, SRL_F_ENABLE_FREEZE_SUPPORT))) {
642         enc->sereal_string_sv = newSVpvs("Sereal");
643     }
644     enc->protocol_version = proto->protocol_version;
645     enc->scratch_sv= newSViv(0);
646     DEBUG_ASSERT_BUF_SANE(&enc->buf);
647     return enc;
648 }
649 
650 SRL_STATIC_INLINE PTABLE_t *
srl_init_string_hash(srl_encoder_t * enc)651 srl_init_string_hash(srl_encoder_t *enc)
652 {
653     enc->str_seenhash = PTABLE_new_size(4);
654     return enc->str_seenhash;
655 }
656 
657 SRL_STATIC_INLINE PTABLE_t *
srl_init_ref_hash(srl_encoder_t * enc)658 srl_init_ref_hash(srl_encoder_t *enc)
659 {
660     enc->ref_seenhash = PTABLE_new_size(4);
661     return enc->ref_seenhash;
662 }
663 
664 SRL_STATIC_INLINE PTABLE_t *
srl_init_weak_hash(srl_encoder_t * enc)665 srl_init_weak_hash(srl_encoder_t *enc)
666 {
667     enc->weak_seenhash = PTABLE_new_size(3);
668     return enc->weak_seenhash;
669 }
670 
671 SRL_STATIC_INLINE PTABLE_t *
srl_init_freezeobj_svhash(srl_encoder_t * enc)672 srl_init_freezeobj_svhash(srl_encoder_t *enc)
673 {
674     enc->freezeobj_svhash = PTABLE_new_size(3);
675     return enc->freezeobj_svhash;
676 }
677 
678 SRL_STATIC_INLINE HV *
srl_init_string_deduper_hv(pTHX_ srl_encoder_t * enc)679 srl_init_string_deduper_hv(pTHX_ srl_encoder_t *enc)
680 {
681     enc->string_deduper_hv = newHV();
682     return enc->string_deduper_hv;
683 }
684 
685 
686 void
srl_write_header(pTHX_ srl_encoder_t * enc,SV * user_header_src,const U32 compress_flags)687 srl_write_header(pTHX_ srl_encoder_t *enc, SV *user_header_src, const U32 compress_flags)
688 {
689     /* 4th to 8th bit are flags. Using 4th for snappy flag. FIXME needs to go in spec. */
690 
691     U8 flags= srl_get_compression_header_flag(compress_flags);
692     const U8 version_and_flags = (U8)enc->protocol_version | flags;
693 
694     /* 4 byte magic string + proto version
695      * + potentially uncompressed size varint
696      * +  1 byte varint that indicates zero-length header */
697     BUF_SIZE_ASSERT(&enc->buf, sizeof(SRL_MAGIC_STRING) + 1 + 1);
698     if (expect_true( enc->protocol_version > 2 ))
699       srl_buf_cat_str_s_nocheck(&enc->buf, SRL_MAGIC_STRING_HIGHBIT);
700     else
701       srl_buf_cat_str_s_nocheck(&enc->buf, SRL_MAGIC_STRING);
702     srl_buf_cat_char_nocheck(&enc->buf, version_and_flags);
703     if (user_header_src == NULL) {
704         srl_buf_cat_char_nocheck(&enc->buf, '\0'); /* variable header length (0 right now) */
705     }
706     else {
707         STRLEN user_data_len;
708 
709         if (expect_false( enc->protocol_version < 2 ))
710             croak("Cannot serialize user header data in Sereal protocol V1 mode!");
711 
712         /* Allocate tmp buffer for swapping if necessary,
713          * will be cleaned up automatically */
714         if (enc->tmp_buf.start == NULL)
715             srl_buf_init_buffer(aTHX_ &enc->tmp_buf, INITIALIZATION_SIZE);
716 
717         /* Write document body (for header) into separate buffer */
718         srl_buf_swap_buffer(aTHX_ &enc->tmp_buf, &enc->buf);
719         SRL_ENC_UPDATE_BODY_POS(enc);
720         srl_dump_sv(aTHX_ enc, user_header_src);
721         srl_fixup_weakrefs(aTHX_ enc); /* more bodies to follow */
722         srl_clear_seen_hashes(aTHX_ enc); /* more bodies to follow */
723 
724         /* Swap main buffer back in, encode header length&bitfield, copy user header data */
725         user_data_len = BUF_POS_OFS(&enc->buf);
726         srl_buf_swap_buffer(aTHX_ &enc->buf, &enc->tmp_buf);
727 
728         BUF_SIZE_ASSERT(&enc->buf, user_data_len + 1 + SRL_MAX_VARINT_LENGTH); /* +1 for bit field, +X for header len */
729 
730         /* Encode header length */
731         srl_buf_cat_varint_nocheck(aTHX_ &enc->buf, 0, (UV)(user_data_len + 1)); /* +1 for bit field */
732         /* Encode bitfield */
733         srl_buf_cat_char_nocheck(&enc->buf, '\1');
734         /* Copy user header data */
735         Copy(enc->tmp_buf.start, enc->buf.pos, user_data_len, char);
736         enc->buf.pos += user_data_len;
737 
738         enc->tmp_buf.pos = enc->tmp_buf.start; /* reset tmp buffer just to be clean */
739     }
740 }
741 
742 /* The following is to handle the fact that under normal build options
743  * VC6 will compare all floating point at 80 bits of precision, regardless
744  * regardless of the type.
745  * By setting the vars to "volatile" we avoid this behavior.
746  * Hopefully this fixes various remaining Win32 test failures we see.
747  *
748  * Note this patch could not have been written without Bulk88's help.
749  * Thanks a lot man!
750  *
751  * Comment from Bulk88:
752  * -O1 and -O2 tested and both of those 2 "failed"
753  * -Op - Improve Float Consistency does not have the bug
754  * Problem not seen in VC 2003
755  * I (Bulk88) don't have a VC 2002 to test v13 officially
756  *
757  */
758 #if defined(_MSC_VER)
759 #   if _MSC_VER < 1300
760 #       define MS_VC6_WORKAROUND_VOLATILE volatile
761 #   else
762 #       define MS_VC6_WORKAROUND_VOLATILE
763 #   endif
764 #else
765 #   define MS_VC6_WORKAROUND_VOLATILE
766 #endif
767 
768 
769 /* Code for serializing floats */
770 SRL_STATIC_INLINE void
srl_dump_nv(pTHX_ srl_encoder_t * enc,SV * src)771 srl_dump_nv(pTHX_ srl_encoder_t *enc, SV *src)
772 {
773     NV nv= SvNV(src);
774     MS_VC6_WORKAROUND_VOLATILE float f= (float)nv;
775     MS_VC6_WORKAROUND_VOLATILE double d= (double)nv;
776     /* TODO: this logic could be reworked to not duplicate so much code, which will help on win32 */
777     if ( f == nv || nv != nv ) {
778         BUF_SIZE_ASSERT(&enc->buf, 1 + sizeof(f)); /* heuristic: header + string + simple value */
779         srl_buf_cat_char_nocheck(&enc->buf, SRL_HDR_FLOAT);
780         Copy((char *)&f, enc->buf.pos, sizeof(f), char);
781         enc->buf.pos += sizeof(f);
782     } else if (d == nv) {
783         BUF_SIZE_ASSERT(&enc->buf, 1 + sizeof(d)); /* heuristic: header + string + simple value */
784         srl_buf_cat_char_nocheck(&enc->buf, SRL_HDR_DOUBLE);
785         Copy((char *)&d, enc->buf.pos, sizeof(d), char);
786         enc->buf.pos += sizeof(d);
787     } else {
788         BUF_SIZE_ASSERT(&enc->buf, 1 + sizeof(nv)); /* heuristic: header + string + simple value */
789         srl_buf_cat_char_nocheck(&enc->buf, SRL_HDR_LONG_DOUBLE);
790         Copy((char *)&nv, enc->buf.pos, sizeof(nv), char);
791 #if SRL_EXTENDED_PRECISION_LONG_DOUBLE
792         /* x86 uses an 80 bit extended precision. on 64 bit machines
793          * this is 16 bytes long, and on 32 bits its is 12 bytes long.
794          * the unused 2/6 bytes are not necessarily zeroed, potentially
795          * allowing internal memory to be exposed. We therefore zero
796          * the unused bytes here. */
797         memset(enc->buf.pos+10, 0, sizeof(nv) - 10);
798 #endif
799         enc->buf.pos += sizeof(nv);
800     }
801 }
802 
803 
804 /* Code for serializing any SINGLE integer type */
805 SRL_STATIC_INLINE void
srl_dump_ivuv(pTHX_ srl_encoder_t * enc,SV * src)806 srl_dump_ivuv(pTHX_ srl_encoder_t *enc, SV *src)
807 {
808     char hdr;
809     /* TODO for the time being, we just won't ever use NUMLIST types because that's
810      *      a fair amount of extra implementation work. The decoders won't care and
811      *      we're just wasting some space. */
812     /* TODO optimize! */
813 
814     /* FIXME find a way to express the condition without repeated SvIV/SvUV */
815     if (expect_true( SvIOK_UV(src) || SvIV(src) >= 0 )) {
816         const UV num = SvUV(src); /* FIXME is SvUV_nomg good enough because of the GET magic in dump_sv? SvUVX after having checked the flags? */
817         if (num <= 15) {
818             /* encodable as POS */
819             hdr = SRL_HDR_POS_LOW | (unsigned char)num;
820             srl_buf_cat_char(&enc->buf, hdr);
821         }
822         else {
823             srl_buf_cat_varint(aTHX_ &enc->buf, SRL_HDR_VARINT, num);
824         }
825     }
826     else {
827         const IV num = SvIV(src);
828         if (num >= -16) {
829             /* encodable as NEG */
830             hdr = SRL_HDR_NEG_LOW | ((unsigned char)num + 32);
831             srl_buf_cat_char(&enc->buf, hdr);
832         }
833         else {
834             /* Needs ZIGZAG */
835             srl_buf_cat_zigzag(aTHX_ &enc->buf, SRL_HDR_ZIGZAG, num);
836         }
837     }
838 }
839 
840 /* Dumps the tag and class name of an object doing all necessary callbacks or
841  * exception-throwing.
842  * The provided SV must already have been identified as a Perl object
843  * using sv_isobject().
844  * If the return value is not NULL, then it's the actual object content that
845  * needs to be serialized by the caller. */
846 SRL_STATIC_INLINE SV *
srl_get_frozen_object(pTHX_ srl_encoder_t * enc,SV * src,SV * referent)847 srl_get_frozen_object(pTHX_ srl_encoder_t *enc, SV *src, SV *referent)
848 {
849     assert(sv_isobject(src)); /* duplicate asserts are "free" */
850 
851     /* Check for FREEZE support */
852     if (expect_false( SRL_ENC_HAVE_OPTION(enc, SRL_F_ENABLE_FREEZE_SUPPORT) )) {
853         HV *stash = SvSTASH(referent);
854         GV *method = NULL;
855         assert(stash != NULL);
856         method = gv_fetchmethod_autoload(stash, "FREEZE", 0);
857 
858         if (expect_false( method != NULL )) {
859             SV *replacement= NULL;
860             PTABLE_t *freezeobj_svhash = SRL_GET_FREEZEOBJ_SVHASH(enc);
861             if (SvREFCNT(referent)>1) {
862                 replacement= (SV *) PTABLE_fetch(freezeobj_svhash, referent);
863             }
864             if (!replacement) {
865                 int count;
866                 dSP;
867                 ENTER;
868                 SAVETMPS;
869                 PUSHMARK(SP);
870 
871                 EXTEND(SP, 2);
872                 PUSHs(src);
873                 PUSHs(enc->sereal_string_sv); /* not NULL if SRL_F_ENABLE_FREEZE_SUPPORT is set */
874                 replacement= (SV*)newAV();
875                 PTABLE_store(freezeobj_svhash, referent, replacement);
876 
877                 PUTBACK;
878                 count = call_sv((SV *)GvCV(method), G_ARRAY);
879                 /* TODO explore method lookup caching */
880                 SPAGAIN;
881 
882                 while ( count-- > 0) {
883                     SV *tmp = POPs;
884                     SvREFCNT_inc(tmp);
885                     if (!av_store((AV*)replacement,count,tmp))
886                         croak("Failed to push value into array");
887                 }
888 
889                 PUTBACK;
890                 FREETMPS;
891                 LEAVE;
892             }
893             return replacement;
894         }
895     }
896     return NULL;
897 
898 }
899 
900 /* Outputs a bless header and the class name (as some form of string or COPY).
901  * Caller then has to output the actual reference payload.
902  * If it returns 1 it means the classname was written out and should NOT
903  * be overwritten by the ref rewrite logic (which handles REFP).
904  * If it returns 0 it means no classname was output. */
905 SRL_STATIC_INLINE int
srl_dump_classname(pTHX_ srl_encoder_t * enc,SV * referent,SV * replacement)906 srl_dump_classname(pTHX_ srl_encoder_t *enc, SV *referent, SV *replacement)
907 {
908     /* Check that we actually want to support objects */
909     if (expect_false( SRL_ENC_HAVE_OPTION(enc, SRL_F_CROAK_ON_BLESS)) ) {
910         croak("Attempted to serialize blessed reference. Serializing objects "
911                 "using Sereal::Encoder was explicitly disabled using the "
912                 "'croak_on_bless' option.");
913     } else if (expect_false( SRL_ENC_HAVE_OPTION(enc, SRL_F_NO_BLESS_OBJECTS) )) {
914         return 0;
915     } else {
916         const HV *stash = SvSTASH(referent);
917         PTABLE_t *string_seenhash = SRL_GET_STR_PTR_SEENHASH(enc);
918         svtype svt= SvTYPE(referent);
919         int is_av_or_hv= (svt == SVt_PVAV || svt== SVt_PVHV);
920         ptrdiff_t oldoffset= is_av_or_hv
921                            ? 0
922                            : (ptrdiff_t)PTABLE_fetch(string_seenhash, referent);
923 
924         if (oldoffset) {
925             return 0;
926         } else {
927             svt= replacement ? SvTYPE(replacement) : SvTYPE(referent);
928             if (SRL_UNSUPPORTED_SvTYPE(svt)) {
929                 return 0;
930             }
931             oldoffset= (ptrdiff_t)PTABLE_fetch(string_seenhash, (SV *)stash);
932         }
933 
934         if (oldoffset != 0) {
935             /* Issue COPY instead of literal class name string */
936             srl_buf_cat_varint(aTHX_ &enc->buf,
937                                      expect_false(replacement) ? SRL_HDR_OBJECTV_FREEZE : SRL_HDR_OBJECTV,
938                                      (UV)oldoffset);
939         }
940         else {
941             const char *class_name = HvNAME_get(stash);
942             const size_t len = HvNAMELEN_get(stash);
943 
944             /* First save this new string (well, the HV * that it is represented by) into the string
945              * dedupe table.
946              * By saving the ptr to the HV, we only dedupe class names with class names, though
947              * this seems a small price to pay for not having to keep a full string table.
948              * At least, we can safely use the same PTABLE to store the ptrs to hashkeys since
949              * the set of pointers will never collide.
950              * /me bows to Yves for the delightfully evil hack. */
951             srl_buf_cat_char(&enc->buf, expect_false(replacement) ? SRL_HDR_OBJECT_FREEZE : SRL_HDR_OBJECT);
952 
953             /* remember current offset before advancing it */
954             PTABLE_store(string_seenhash, (void *)stash, INT2PTR(void *, BODY_POS_OFS(&enc->buf)));
955 
956             /* HvNAMEUTF8 not in older perls and it would be 0 for those anyway */
957 #if PERL_VERSION >= 16
958             srl_dump_pv(aTHX_ enc, class_name, len, HvNAMEUTF8(stash));
959 #else
960             srl_dump_pv(aTHX_ enc, class_name, len, 0);
961 #endif
962         }
963         if (is_av_or_hv) {
964             return 0;
965         } else {
966             /* use the string_seenhash to track which items we have seen before */
967             PTABLE_store(string_seenhash, (void *)referent, INT2PTR(void *, BODY_POS_OFS(&enc->buf)));
968             return 1;
969         }
970     }
971     return 0;
972 }
973 
974 
975 /* Prepare encoder for encoding: Clone if already in use since
976  * encoders aren't "reentrant". Set as in use and register cleanup
977  * routine with Perl. */
978 SRL_STATIC_INLINE srl_encoder_t *
srl_prepare_encoder(pTHX_ srl_encoder_t * enc)979 srl_prepare_encoder(pTHX_ srl_encoder_t *enc)
980 {
981     /* Check whether encoder is in use and create a new one on the
982      * fly if necessary. Should only happen in edge cases such as
983      * FREEZE hooks that serialize things using the same encoder
984      * object. */
985     if (SRL_ENC_HAVE_OPER_FLAG(enc, SRL_OF_ENCODER_DIRTY)) {
986         srl_encoder_t * const proto = enc;
987         enc = srl_build_encoder_struct_alike(aTHX_ proto);
988         SRL_ENC_RESET_OPTION(enc, SRL_F_REUSE_ENCODER);
989     }
990     /* Set to being in use */;
991     SRL_ENC_SET_OPER_FLAG(enc, SRL_OF_ENCODER_DIRTY);
992 
993     /* Register our structure for destruction on scope exit */
994     SAVEDESTRUCTOR_X(&srl_destructor_hook, (void *)enc);
995 
996     return enc;
997 }
998 
999 SRL_STATIC_INLINE srl_encoder_t *
srl_dump_data_structure(pTHX_ srl_encoder_t * enc,SV * src,SV * user_header_src)1000 srl_dump_data_structure(pTHX_ srl_encoder_t *enc, SV *src, SV *user_header_src)
1001 {
1002     U32 compress_flags;
1003 
1004     enc = srl_prepare_encoder(aTHX_ enc);
1005     compress_flags= SRL_ENC_HAVE_OPTION(enc, SRL_F_COMPRESS_FLAGS_MASK);
1006 
1007     if (expect_false(compress_flags))
1008     { /* Have some sort of compression */
1009         ptrdiff_t sereal_header_len;
1010         STRLEN uncompressed_body_length;
1011         const STRLEN max_len = (1L << 32) - 1;
1012 
1013         /* Alas, have to write entire packet first since the header length
1014          * will determine offsets. */
1015         srl_write_header(aTHX_ enc, user_header_src, compress_flags);
1016         sereal_header_len = BUF_POS_OFS(&enc->buf);
1017         SRL_ENC_UPDATE_BODY_POS(enc);
1018         srl_dump_sv(aTHX_ enc, src);
1019         srl_fixup_weakrefs(aTHX_ enc);
1020         assert(BUF_POS_OFS(&enc->buf) > sereal_header_len);
1021         uncompressed_body_length = BUF_POS_OFS(&enc->buf) - sereal_header_len;
1022 
1023         if ((uncompressed_body_length < (STRLEN)enc->compress_threshold) || uncompressed_body_length > max_len) {
1024             if (uncompressed_body_length > max_len) {
1025                 /* we dont support SNAPPY on super long buffers, it has a 2**32 limit
1026                  * and we currently don't support splitting things up. See Issue #88 */
1027                 warn("disabling SNAPPY compression as buffer is too large!");
1028             }
1029             /* Don't bother with compression at all if we have less than $threshold bytes of payload */
1030             srl_reset_compression_header_flag(&enc->buf);
1031         }
1032         else { /* Do Snappy or zlib compression of body */
1033             srl_compress_body(aTHX_ &enc->buf, sereal_header_len,
1034                               compress_flags, enc->compress_level,
1035                               &enc->snappy_workmem);
1036 
1037             SRL_ENC_UPDATE_BODY_POS(enc);
1038             DEBUG_ASSERT_BUF_SANE(&enc->buf);
1039         }
1040     } /* End of "want compression?" */
1041     else
1042     {
1043         srl_write_header(aTHX_ enc, user_header_src, compress_flags);
1044         SRL_ENC_UPDATE_BODY_POS(enc);
1045         srl_dump_sv(aTHX_ enc, src);
1046         srl_fixup_weakrefs(aTHX_ enc);
1047     }
1048 
1049     /* NOT doing a
1050      *   SRL_ENC_RESET_OPER_FLAG(enc, SRL_OF_ENCODER_DIRTY);
1051      * here because we're relying on the SAVEDESTRUCTOR_X call. */
1052     return enc;
1053 }
1054 
1055 SV *
srl_dump_data_structure_mortal_sv(pTHX_ srl_encoder_t * enc,SV * src,SV * user_header_src,const U32 flags)1056 srl_dump_data_structure_mortal_sv(pTHX_ srl_encoder_t *enc, SV *src, SV *user_header_src, const U32 flags)
1057 {
1058     assert(enc);
1059     enc = srl_dump_data_structure(aTHX_ enc, src, user_header_src);
1060     assert(enc->buf.start && enc->buf.pos && enc->buf.pos > enc->buf.start);
1061 
1062     if ( flags && /* for now simpler and equivalent to: flags == SRL_ENC_SV_REUSE_MAYBE */
1063          (BUF_POS_OFS(&enc->buf) > 20 && BUF_SPACE(&enc->buf) < BUF_POS_OFS(&enc->buf) )
1064     ){
1065         /* If not wasting more than 2x memory - FIXME fungible */
1066         SV *sv = sv_2mortal(newSV_type(SVt_PV));
1067         SvPV_set(sv, (char *) enc->buf.start);
1068         SvLEN_set(sv, BUF_SIZE(&enc->buf));
1069         SvCUR_set(sv, BUF_POS_OFS(&enc->buf));
1070         SvPOK_on(sv);
1071         enc->buf.start = enc->buf.pos = NULL; /* no need to free these guys now */
1072         return sv;
1073     }
1074 
1075     return sv_2mortal(newSVpvn((char *)enc->buf.start, (STRLEN)BUF_POS_OFS(&enc->buf)));
1076 }
1077 
1078 SRL_STATIC_INLINE void
srl_fixup_weakrefs(pTHX_ srl_encoder_t * enc)1079 srl_fixup_weakrefs(pTHX_ srl_encoder_t *enc)
1080 {
1081     PTABLE_t *weak_seenhash = SRL_GET_WEAK_SEENHASH_OR_NULL(enc);
1082     if (!weak_seenhash)
1083         return;
1084 
1085     {
1086         PTABLE_ITER_t *it = PTABLE_iter_new(weak_seenhash);
1087         PTABLE_ENTRY_t *ent;
1088 
1089         /* we now walk the weak_seenhash and set any tags it points
1090          * at to the PAD opcode, this basically turns the first weakref
1091          * we encountered into a normal ref when there is only a weakref
1092          * pointing at the structure. */
1093         while ( NULL != (ent = PTABLE_iter_next(it)) ) {
1094             const ptrdiff_t offset = (ptrdiff_t)ent->value;
1095             if ( offset ) {
1096                 srl_buffer_char *pos = enc->buf.body_pos + offset;
1097                 assert(*pos == SRL_HDR_WEAKEN);
1098                 if (DEBUGHACK) warn("setting byte at offset %"UVuf" to PAD", (UV)offset);
1099                 *pos = SRL_HDR_PAD;
1100             }
1101         }
1102 
1103         PTABLE_iter_free(it);
1104     }
1105 }
1106 
1107 
1108 
1109 static inline void
srl_dump_regexp(pTHX_ srl_encoder_t * enc,SV * sv)1110 srl_dump_regexp(pTHX_ srl_encoder_t *enc, SV *sv)
1111 {
1112     STRLEN left = 0;
1113     const char *fptr;
1114     char ch;
1115     U16 match_flags;
1116 #ifdef MODERN_REGEXP
1117     REGEXP *re= SvRX(sv);
1118 #else
1119     regexp *re = (regexp *)(((MAGIC*)sv)->mg_obj);
1120 #endif
1121 
1122     char reflags[sizeof(INT_PAT_MODS) + MAX_CHARSET_NAME_LENGTH];
1123 
1124     /*
1125        we are in list context so stringify
1126        the modifiers that apply. We ignore "negative
1127        modifiers" in this scenario, and the default character set
1128     */
1129 
1130 #ifdef REGEXP_DEPENDS_CHARSET
1131     if (get_regex_charset(RX_EXTFLAGS(re)) != REGEX_DEPENDS_CHARSET) {
1132         STRLEN len;
1133         const char* const name = get_regex_charset_name(RX_EXTFLAGS(re),
1134                                                         &len);
1135         Copy(name, reflags + left, len, char);
1136         left += len;
1137     }
1138 #endif
1139     fptr = INT_PAT_MODS;
1140     match_flags = (U16)((RX_EXTFLAGS(re) & RXf_PMf_COMPILETIME)
1141                             >> RXf_PMf_STD_PMMOD_SHIFT);
1142 
1143     while((ch = *fptr++)) {
1144         if(match_flags & 1) {
1145             reflags[left++] = ch;
1146         }
1147         match_flags >>= 1;
1148     }
1149 
1150     srl_buf_cat_char(&enc->buf, SRL_HDR_REGEXP);
1151     srl_dump_pv(aTHX_ enc, RX_PRECOMP(re),RX_PRELEN(re), (RX_UTF8(re) ? SVf_UTF8 : 0));
1152     srl_dump_pv(aTHX_ enc, reflags, left, 0);
1153     return;
1154 }
1155 
1156 #define ASSUME_BYTES_PER_TAG 4
1157 #define BUF_SIZE_ASSERT_AV(b,n) \
1158         BUF_SIZE_ASSERT((b), 2 + SRL_MAX_VARINT_LENGTH + (1 * ASSUME_BYTES_PER_TAG * (n) ) )
1159 /* heuristic: 6 * n = liberal estimate of min size of n hashkeys */
1160 #define BUF_SIZE_ASSERT_HV(b, n) \
1161         BUF_SIZE_ASSERT((b), 2 + SRL_MAX_VARINT_LENGTH + (2 * ASSUME_BYTES_PER_TAG * (n) ) )
1162 
1163 SRL_STATIC_INLINE void
srl_dump_av(pTHX_ srl_encoder_t * enc,AV * src,U32 refcount)1164 srl_dump_av(pTHX_ srl_encoder_t *enc, AV *src, U32 refcount)
1165 {
1166     UV n;
1167     SV **svp;
1168 
1169     n = av_len(src)+1;
1170 
1171     /* heuristic: n is virtually the min. size of any element */
1172     BUF_SIZE_ASSERT_AV(&enc->buf, n);
1173 
1174     if (n < 16 && refcount == 1 && !SRL_ENC_HAVE_OPTION(enc,SRL_F_CANONICAL_REFS)) {
1175         enc->buf.pos--; /* backup over previous REFN */
1176         srl_buf_cat_char_nocheck(&enc->buf, SRL_HDR_ARRAYREF + n);
1177     } else {
1178         /* header and num. elements */
1179         srl_buf_cat_varint_nocheck(aTHX_ &enc->buf, SRL_HDR_ARRAY, n);
1180     }
1181     if (!n)
1182         return;
1183     /* I can't decide if this should make me feel dirty */
1184     if (SvMAGICAL(src)) {
1185         UV i;
1186         for (i = 0; i < n; ++i) {
1187             svp = av_fetch(src, i, 0);
1188             CALL_SRL_DUMP_SVP(enc, svp);
1189         }
1190     } else {
1191         SV **end;
1192         svp= AvARRAY(src);
1193         end= svp + n;
1194         for ( ; svp < end ; svp++) {
1195             /* we cannot have a null *svp so we do not use CALL_SRL_DUMP_SVP() here */
1196             CALL_SRL_DUMP_SV(enc, *svp);
1197         }
1198     }
1199 }
1200 
1201 SRL_STATIC_INLINE void
srl_dump_hv_unsorted_nomg(pTHX_ srl_encoder_t * enc,HV * src,UV n)1202 srl_dump_hv_unsorted_nomg(pTHX_ srl_encoder_t *enc, HV *src, UV n)
1203 {
1204     HE *he;
1205     const int do_share_keys = HvSHAREKEYS((SV *)src);
1206     HE **he_ptr= HvARRAY(src);
1207     HE **he_end= he_ptr + HvMAX(src) + 1;
1208 
1209     do {
1210         for (he= *he_ptr++; he; he= HeNEXT(he) ) {
1211             SV *v= HeVAL(he);
1212             if (v != &PL_sv_placeholder) {
1213                 srl_dump_hk(aTHX_ enc, he, do_share_keys);
1214                 CALL_SRL_DUMP_SV(enc, v);
1215                 if (--n == 0) {
1216                     he_ptr= he_end;
1217                     break;
1218                 }
1219             }
1220         }
1221     } while ( he_ptr < he_end );
1222 }
1223 
1224 SRL_STATIC_INLINE void
srl_dump_hv_unsorted_mg(pTHX_ srl_encoder_t * enc,HV * src,const UV n)1225 srl_dump_hv_unsorted_mg(pTHX_ srl_encoder_t *enc, HV *src, const UV n)
1226 {
1227     HE *he;
1228     UV i= 0;
1229     const int do_share_keys = HvSHAREKEYS((SV *)src);
1230 
1231     (void)hv_iterinit(src); /* return value not reliable according to API docs */
1232     while ((he = hv_iternext(src))) {
1233         SV *v;
1234         if (expect_false( i == n ))
1235             croak("Panic: cannot serialize a tied hash which changes its size!");
1236         v= hv_iterval(src, he);
1237         srl_dump_hk(aTHX_ enc, he, do_share_keys);
1238         CALL_SRL_DUMP_SV(enc, v);
1239         ++i;
1240     }
1241     if (expect_false( i != n ))
1242         croak("Panic: cannot serialize a tied hash which changes its size!");
1243 }
1244 
1245 /* sorting hashes - nothing in perl is easy. ever.
1246  *
1247  * Some things to keep in mind about perl hashes as you read this code:
1248  *
1249  * Hashes may be shared or not. Usually shared. This means they share their
1250  * key data via PL_strtab.
1251  *
1252  * Hashes may be tied or not. Usually not. When tied the keys from the hash
1253  * are available only as SV *'s, and when untied, the keys from the hash are
1254  * accessed via HE *'s.
1255  *
1256  * Some HE's actually contains SV's but most contain a ptr/len combo with
1257  * an utf8 flag. To make things even more interesting utf8 keys are
1258  * normalized to latin1 by perl where possible before being stored in the HE,
1259  * with the utf8 flag indicating "was utf8" instead of "is utf8" or "not utf8".
1260  *
1261  * The complexity about accessing the key for a hash can be managed away by
1262  * perl via API's like hv_iterkeysv(), but using that means constructing mortal
1263  * SV's for each key as we go.
1264  *
1265  * We could in theory use the HePV() interface, but one annoying result of the
1266  * "was utf8" logic is that we can't use a sort comparator which looks
1267  * at the raw binary of the keys when the keys might contain utf8. A utf8 key
1268  * like "\xDF" will be downgraded to ascii in the HE form, but will be upgraded
1269  * to the utf8 representation in the SV form. So if we want to do "fast" sorting
1270  * we have to restrict it to non-utf8/non-sv keys, and force the use of the SV
1271  * based API (which we have to use for tie's anyway) when we see a UTF8 key.
1272  *
1273  * Which is what we do below. In order to sort a hash we need to construct an
1274  * array of its contents, in srl_dump_sorted_nomg() we walk the hash, checking
1275  * each key, and copying each HE over into a scratch buffer which it then sorts.
1276  * If during the transcription process it sees any utf8 or SV keys it exits
1277  * immediately, and falls through to srl_dump_sort_mg(), which uses hv_iterkeysv()
1278  * to construct an array of HE_SV instead, which we then sort.
1279  */
1280 
1281 
1282 
1283 SRL_STATIC_INLINE int
he_islt(const HE * a,const HE * b)1284 he_islt(const HE *a, const HE *b)
1285 {
1286     /* no need for a dTHX here, we don't use anything that needs it */
1287     const STRLEN la = HeKLEN(a);
1288     const STRLEN lb = HeKLEN(b);
1289     const int cmp = memcmp(HeKEY(a), HeKEY(b), la < lb ? la : lb);
1290     if (cmp) {
1291         return cmp < 0;
1292     } else {
1293         return la < lb;
1294     }
1295 }
1296 
1297 SRL_STATIC_INLINE int
he_sv_islt_fast(const HE_SV * a,const HE_SV * b)1298 he_sv_islt_fast(const HE_SV *a, const HE_SV *b)
1299 {
1300     /* no need for a dTHX here, we don't use anything that needs it */
1301     char *a_ptr;
1302     char *b_ptr;
1303     int a_isutf8;
1304     int b_isutf8;
1305     const STRLEN a_len= a->key.sv ? SvCUR(a->key.sv) : HeKLEN(a->val.he);
1306     const STRLEN b_len= b->key.sv ? SvCUR(b->key.sv) : HeKLEN(b->val.he);
1307     if (a_len != b_len) {
1308         return a_len < b_len;
1309     }
1310     a_isutf8= (a->key.sv ? SvUTF8(a->key.sv) : HeKUTF8(a->val.he)) ? 0 : 1;
1311     b_isutf8= (b->key.sv ? SvUTF8(b->key.sv) : HeKUTF8(b->val.he)) ? 0 : 1;
1312     if (a_isutf8 != b_isutf8) {
1313         return a_isutf8 < b_isutf8;
1314     }
1315     a_ptr= a->key.sv ? SvPVX(a->key.sv) : HeKEY(a->val.he);
1316     b_ptr= b->key.sv ? SvPVX(b->key.sv) : HeKEY(b->val.he);
1317     return memcmp(a_ptr, b_ptr, a_len < b_len ? a_len : b_len ) < 0;
1318 }
1319 
1320 #define ISLT_HE_SV(a,b)    he_sv_islt_fast( a, b )
1321 #define ISLT_SV_CMP(a,b)   sv_cmp(a->key.sv, b->key.sv) == sort_dir
1322 
1323 
1324 SRL_STATIC_INLINE void
srl_qsort(pTHX_ srl_encoder_t * enc,const UV n,HE_SV * array)1325 srl_qsort(pTHX_ srl_encoder_t *enc, const UV n, HE_SV *array)
1326 {
1327     if ( SRL_ENC_HAVE_OPTION(enc, SRL_F_SORT_KEYS_PERL) ) {
1328         int sort_dir= SRL_ENC_HAVE_OPTION(enc, SRL_F_SORT_KEYS_PERL_REV) ? 1 : -1;
1329         /* hack to forcefully disable "use bytes" */
1330         COP cop= *PL_curcop;
1331         cop.op_private= 0;
1332 
1333         ENTER;
1334         SAVETMPS;
1335 
1336         SAVEVPTR (PL_curcop);
1337         PL_curcop= &cop;
1338 
1339         /* now sort */
1340         QSORT(HE_SV, array, n, ISLT_SV_CMP);
1341 
1342         FREETMPS;
1343         LEAVE;
1344     } else {
1345         /* now sort */
1346         QSORT(HE_SV, array, n, ISLT_HE_SV);
1347     }
1348 }
1349 
1350 
1351 SRL_STATIC_INLINE void
srl_dump_hv_sorted_sv_slow(pTHX_ srl_encoder_t * enc,HV * src,const UV n,HE_SV * array)1352 srl_dump_hv_sorted_sv_slow(pTHX_ srl_encoder_t *enc, HV *src, const UV n, HE_SV *array)
1353 {
1354     HE *he;
1355     UV i= 0;
1356     const int do_share_keys = HvSHAREKEYS((SV *)src);
1357     const int is_tie= !array;
1358 
1359     /* This sub is used for ties, and for hashes with SV keys in them,
1360      * and when the user requests SORT_KEYS_PERL, it is the slowest way
1361      * and most memory hungry way to serialize a hash. We will use the
1362      * full perl api for extracting the contents of the hash, which fortifies
1363      * us against ties, and we will convert all keys into mortal
1364      * sv's where necessary. This means we can use sv_cmp on the keys
1365      * if we wish.
1366      */
1367 
1368     (void)hv_iterinit(src); /* return value not reliable according to API docs */
1369     {
1370         HE_SV *array_end;
1371         if (!array) {
1372             Newx(array, n, HE_SV);
1373             SAVEFREEPV(array);
1374         }
1375         array_end= array + n;
1376         while ((he = hv_iternext(src))) {
1377             if (expect_false( i == n ))
1378                 croak("Panic: cannot serialize a %s hash which changes its size!",is_tie ? "tied" : "untied");
1379             array[i].key.sv= hv_iterkeysv(he);
1380             array[i].val.sv= hv_iterval(src,he);
1381             i++;
1382         }
1383         if (expect_false( i != n ))
1384             croak("Panic: can not serialize a %s hash which changes it size!", is_tie ? "tied" : "untied");
1385 
1386         srl_qsort(aTHX_ enc, n, array);
1387 
1388         while ( array < array_end ) {
1389             CALL_SRL_DUMP_SV(enc, array->key.sv);
1390             CALL_SRL_DUMP_SV(enc, array->val.sv);
1391             array++;
1392         }
1393     }
1394 }
1395 
1396 
1397 SRL_STATIC_INLINE void
srl_dump_hv_sorted_nomg(pTHX_ srl_encoder_t * enc,HV * src,const UV n)1398 srl_dump_hv_sorted_nomg(pTHX_ srl_encoder_t *enc, HV *src, const UV n)
1399 {
1400     HE *he;
1401     const int do_share_keys = HvSHAREKEYS((SV *)src);
1402 
1403     /* This sub is used only for untied hashes and when the user wants
1404      * sorted keys, but not necessarily the order that perl would use.
1405      */
1406 
1407     (void)hv_iterinit(src); /* return value not reliable according to API docs */
1408     {
1409         HE_SV *array;
1410         HE_SV *array_ptr;
1411         HE_SV *array_end;
1412         Newx(array, n, HE_SV);
1413         SAVEFREEPV(array);
1414         array_ptr = array;
1415         while ((he = hv_iternext(src))) {
1416             if ( HeKWASUTF8(he) ) {
1417                 array_ptr->key.sv= hv_iterkeysv(he);
1418             } else {
1419                 array_ptr->key.sv = HeSVKEY(he);
1420             }
1421             array_ptr->val.he = he;
1422             array_ptr++;
1423         }
1424 
1425         srl_qsort(aTHX_ enc, n, array);
1426 
1427         array_end = array + n;
1428         for ( array_end= array + n; array < array_end; array++ ) {
1429             SV *v;
1430             he = array->val.he;
1431             v = hv_iterval(src, he);
1432             srl_dump_hk(aTHX_ enc, he, do_share_keys);
1433             CALL_SRL_DUMP_SV(enc, v);
1434         }
1435     }
1436 }
1437 
1438 SRL_STATIC_INLINE void
srl_dump_hv(pTHX_ srl_encoder_t * enc,HV * src,U32 refcount)1439 srl_dump_hv(pTHX_ srl_encoder_t *enc, HV *src, U32 refcount)
1440 {
1441     HE *he;
1442     UV n;
1443     if ( SvMAGICAL(src) ) {
1444         /* for tied hashes, we have to iterate to find the number of entries. Alas... */
1445         n= 0;
1446         (void)hv_iterinit(src); /* return value not reliable according to API docs */
1447         while ((he = hv_iternext(src))) { ++n; }
1448     }
1449     else {
1450         n= HvUSEDKEYS(src);
1451     }
1452 
1453     BUF_SIZE_ASSERT_HV(&enc->buf, n);
1454     if (n < 16 && refcount == 1 && !SRL_ENC_HAVE_OPTION(enc,SRL_F_CANONICAL_REFS)) {
1455         enc->buf.pos--; /* backup over the previous REFN */
1456         srl_buf_cat_char_nocheck(&enc->buf, SRL_HDR_HASHREF + n);
1457     } else {
1458         srl_buf_cat_varint_nocheck(aTHX_ &enc->buf, SRL_HDR_HASH, n);
1459     }
1460 
1461     if ( n ) {
1462         if ( SvMAGICAL(src) || SRL_ENC_HAVE_OPTION(enc, SRL_F_SORT_KEYS_PERL) ) {
1463             /* SORT_KEYS_PERL implies SORT_KEYS, but we check for either just to be
1464              * careful - yves*/
1465             if ( SRL_ENC_HAVE_OPTION(enc, SRL_F_SORT_KEYS|SRL_F_SORT_KEYS_PERL) ) {
1466                 srl_dump_hv_sorted_sv_slow(aTHX_ enc, src, n, NULL);
1467             }
1468             else {
1469                 srl_dump_hv_unsorted_mg(aTHX_ enc, src, n);
1470             }
1471         }
1472         else {
1473             if ( SRL_ENC_HAVE_OPTION(enc, SRL_F_SORT_KEYS) ) {
1474                 srl_dump_hv_sorted_nomg(aTHX_ enc, src, n);
1475             }
1476             else {
1477                 srl_dump_hv_unsorted_nomg(aTHX_ enc, src, n);
1478             }
1479         }
1480     }
1481 }
1482 
1483 
1484 
1485 SRL_STATIC_INLINE void
srl_dump_hk(pTHX_ srl_encoder_t * enc,HE * src,const int share_keys)1486 srl_dump_hk(pTHX_ srl_encoder_t *enc, HE *src, const int share_keys)
1487 {
1488     char *str;
1489     STRLEN len;
1490     char mode;
1491 
1492     if (HeKLEN(src) == HEf_SVKEY) {
1493         SV *sv = HeSVKEY(src);
1494 
1495         SvGETMAGIC(sv);
1496         str = SvPV(sv, len);
1497         mode= SvUTF8(sv) ? 1 : 0;
1498 
1499     }
1500     else {
1501         str = HeKEY(src);
1502         /* This logic is an optimization for output space: We keep track of
1503          * all seen hash key strings that are in perl's shared string storage.
1504          * If we see one again, we just emit a COPY instruction.
1505          * This means that we only need to keep a ptr table since the strings
1506          * don't move in the shared key storage -- otherwise, we'd have to
1507          * compare strings / keep a full string hash table. */
1508         if ( share_keys && SRL_ENC_HAVE_OPTION(enc, SRL_F_SHARED_HASHKEYS) /* only enter branch if shared hk's enabled */
1509 #if PERL_VERSION >= 10
1510              && (!DO_SHARED_HASH_ENTRY_REFCOUNT_CHECK
1511                 || src->he_valu.hent_refcount > 1)
1512 #endif
1513             )
1514         {
1515             PTABLE_t *string_seenhash = SRL_GET_STR_PTR_SEENHASH(enc);
1516             const ptrdiff_t oldoffset = (ptrdiff_t)PTABLE_fetch(string_seenhash, str);
1517             if (oldoffset != 0) {
1518                 /* Issue COPY instead of literal hash key string */
1519                 srl_buf_cat_varint(aTHX_ &enc->buf, SRL_HDR_COPY, (UV)oldoffset);
1520                 return;
1521             }
1522             else {
1523                 /* remember current offset before advancing it */
1524                 const ptrdiff_t newoffset = BODY_POS_OFS(&enc->buf);
1525                 PTABLE_store(string_seenhash, (void *)str, INT2PTR(void *, newoffset));
1526             }
1527         }
1528         len= HeKLEN(src);
1529         mode= HeKWASUTF8(src) ? 2 :  HeKUTF8(src) ? 1 : 0;
1530     }
1531     if (mode == 2) { /* must convert back to utf8 */
1532         char* utf8= (char *)Perl_bytes_to_utf8(aTHX_ (U8 *)str, &len);
1533         srl_dump_pv(aTHX_ enc, utf8, len, 1);
1534         Safefree(utf8);
1535     } else {
1536         srl_dump_pv(aTHX_ enc, str, len, mode);
1537     }
1538 }
1539 
1540 SRL_STATIC_INLINE void
srl_dump_svpv(pTHX_ srl_encoder_t * enc,SV * src)1541 srl_dump_svpv(pTHX_ srl_encoder_t *enc, SV *src)
1542 {
1543     STRLEN len;
1544     const char * const str= SvPV(src, len);
1545     if ( SRL_ENC_HAVE_OPTION(enc, SRL_F_DEDUPE_STRINGS) && len > 3 ) {
1546         HV *string_deduper_hv= SRL_GET_STR_DEDUPER_HV(enc);
1547         HE *dupe_offset_he= hv_fetch_ent(string_deduper_hv, src, 1, 0);
1548         if (!dupe_offset_he) {
1549             croak("out of memory (hv_fetch_ent returned NULL)");
1550         } else {
1551             const char out_tag= SRL_ENC_HAVE_OPTION(enc, SRL_F_ALIASED_DEDUPE_STRINGS)
1552                                 ? SRL_HDR_ALIAS
1553                                 : SRL_HDR_COPY;
1554             SV *ofs_sv= HeVAL(dupe_offset_he);
1555             if (SvIOK(ofs_sv)) {
1556                 /* emit copy or alias */
1557                 if (out_tag == SRL_HDR_ALIAS)
1558                     SRL_SET_TRACK_FLAG(*(enc->buf.body_pos + SvUV(ofs_sv)));
1559                 srl_buf_cat_varint(aTHX_ &enc->buf, out_tag, SvIV(ofs_sv));
1560                 return;
1561             } else if (SvUOK(ofs_sv)) {
1562                 srl_buf_cat_varint(aTHX_ &enc->buf, out_tag, SvUV(ofs_sv));
1563                 return;
1564             } else {
1565                 /* start tracking this string */
1566                 sv_setuv(ofs_sv, (UV)BODY_POS_OFS(&enc->buf));
1567             }
1568         }
1569     }
1570     srl_dump_pv(aTHX_ enc, str, len, SvUTF8(src));
1571 }
1572 
1573 SRL_STATIC_INLINE void
srl_dump_pv(pTHX_ srl_encoder_t * enc,const char * src,STRLEN src_len,int is_utf8)1574 srl_dump_pv(pTHX_ srl_encoder_t *enc, const char* src, STRLEN src_len, int is_utf8)
1575 {
1576     BUF_SIZE_ASSERT(&enc->buf, 1 + SRL_MAX_VARINT_LENGTH + src_len); /* overallocate a bit sometimes */
1577     if (is_utf8) {
1578         srl_buf_cat_varint_nocheck(aTHX_ &enc->buf, SRL_HDR_STR_UTF8, src_len);
1579     } else if (src_len <= SRL_MASK_SHORT_BINARY_LEN) {
1580         srl_buf_cat_char_nocheck(&enc->buf, SRL_HDR_SHORT_BINARY_LOW | (char)src_len);
1581     } else {
1582         srl_buf_cat_varint_nocheck(aTHX_ &enc->buf, SRL_HDR_BINARY, src_len);
1583     }
1584     Copy(src, enc->buf.pos, src_len, char);
1585     enc->buf.pos += src_len;
1586 }
1587 
1588 #ifdef HAS_HV_BACKREFS
1589 AV *
srl_hv_backreferences_p_safe(pTHX_ HV * hv)1590 srl_hv_backreferences_p_safe(pTHX_ HV *hv) {
1591     if (SvOOK(hv)) {
1592         struct xpvhv_aux * const iter = HvAUX(hv);
1593         return iter->xhv_backreferences;
1594     } else {
1595         return NULL;
1596     }
1597 }
1598 #endif
1599 
1600 /* Dumps generic SVs and delegates
1601  * to more specialized functions for RVs, etc. */
1602 /* TODO decide when to use the IV, when to use the PV, and when
1603  *      to use the NV slots of the SV.
1604  *      Safest simple solution seems "prefer string" (fuck dualvars).
1605  *      Potentially better but slower: If we would choose the string,
1606  *      then try int-to-string (respective float-to-string) conversion
1607  *      and strcmp. If same, then use int or float.
1608  */
1609 static void
srl_dump_sv(pTHX_ srl_encoder_t * enc,SV * src)1610 srl_dump_sv(pTHX_ srl_encoder_t *enc, SV *src)
1611 {
1612     UV refcount;
1613     svtype svt;
1614     MAGIC *mg;
1615     AV *backrefs;
1616     SV* refsv= NULL;
1617     SV* replacement= NULL;
1618     UV weakref_ofs= 0;              /* preserved between loops */
1619     SSize_t ref_rewrite_pos= 0;      /* preserved between loops - note SSize_t is a perl define */
1620     assert(src);
1621 
1622     if (expect_false( ++enc->recursion_depth == enc->max_recursion_depth )) {
1623         croak("Hit maximum recursion depth (%"UVuf"), aborting serialization",
1624               (UV)enc->max_recursion_depth);
1625     }
1626 
1627 redo_dump:
1628     mg= NULL;
1629     backrefs= NULL;
1630     svt = SvTYPE(src);
1631     refcount = SvREFCNT(src);
1632     DEBUG_ASSERT_BUF_SANE(&enc->buf);
1633     if ( SvMAGICAL(src) ) {
1634         SvGETMAGIC(src);
1635 #ifdef HAS_HV_BACKREFS
1636         if (svt != SVt_PVHV)
1637 #endif
1638             mg = mg_find(src, PERL_MAGIC_backref);
1639     }
1640 #ifdef HAS_HV_BACKREFS
1641     if (expect_false( svt == SVt_PVHV && SvOOK(src) )) {
1642         backrefs= srl_hv_backreferences_p_safe(aTHX_ (HV *)src);
1643         if (DEBUGHACK) warn("backreferences %p", src);
1644     }
1645 #endif
1646     if (expect_false( mg || backrefs )) {
1647         PTABLE_t *weak_seenhash= SRL_GET_WEAK_SEENHASH(enc);
1648         PTABLE_ENTRY_t *pe= PTABLE_find(weak_seenhash, src);
1649         if (!pe) {
1650             /* not seen it before */
1651             if (DEBUGHACK) warn("scalar %p - is weak referent, storing %"UVuf, src, weakref_ofs);
1652             /* if weakref_ofs is false we got here some way that holds a refcount on this item */
1653             PTABLE_store(weak_seenhash, src, INT2PTR(void *, weakref_ofs));
1654         } else {
1655             if (DEBUGHACK) warn("scalar %p - is weak referent, seen before value:%"UVuf" weakref_ofs:%"UVuf,
1656                     src, (UV)pe->value, (UV)weakref_ofs);
1657             if (pe->value)
1658                 pe->value= INT2PTR(void *, weakref_ofs);
1659         }
1660         refcount++;
1661         weakref_ofs= 0;
1662     }
1663 
1664     /* check if we have seen this scalar before, and track it so
1665      * if we see it again we recognize it */
1666     if ( expect_false( refcount > 1 ) ) {
1667         if (src == &PL_sv_undef && enc->protocol_version >=3 ) {
1668             srl_buf_cat_char(&enc->buf, SRL_HDR_CANONICAL_UNDEF);
1669             --enc->recursion_depth;
1670             return;
1671         }
1672         else
1673         if (src == &PL_sv_yes) {
1674             srl_buf_cat_char(&enc->buf, SRL_HDR_TRUE);
1675             --enc->recursion_depth;
1676             return;
1677         }
1678         else
1679         if (src == &PL_sv_no) {
1680             srl_buf_cat_char(&enc->buf, SRL_HDR_FALSE);
1681             --enc->recursion_depth;
1682             return;
1683         }
1684         else {
1685             PTABLE_t *ref_seenhash= SRL_GET_REF_SEENHASH(enc);
1686             const ptrdiff_t oldoffset = (ptrdiff_t)PTABLE_fetch(ref_seenhash, src);
1687             if (expect_false(oldoffset)) {
1688                 /* we have seen it before, so we do not need to bless it again */
1689                 if (ref_rewrite_pos) {
1690                     if (DEBUGHACK) warn("ref to %p as %"UVuf, src, (UV)oldoffset);
1691                     enc->buf.pos= enc->buf.body_pos + ref_rewrite_pos;
1692                     srl_buf_cat_varint(aTHX_ &enc->buf, SRL_HDR_REFP, (UV)oldoffset);
1693                 } else {
1694                     if (DEBUGHACK) warn("alias to %p as %"UVuf, src, (UV)oldoffset);
1695                     srl_buf_cat_varint(aTHX_ &enc->buf, SRL_HDR_ALIAS, (UV)oldoffset);
1696                 }
1697                 SRL_SET_TRACK_FLAG(*(enc->buf.body_pos + oldoffset));
1698                 --enc->recursion_depth;
1699                 return;
1700             }
1701             if (DEBUGHACK) warn("storing %p as %"UVuf, src, (UV)BODY_POS_OFS(&enc->buf));
1702             PTABLE_store(ref_seenhash, src, INT2PTR(void *, BODY_POS_OFS(&enc->buf)));
1703         }
1704     }
1705 
1706     if (expect_false( weakref_ofs != 0 )) {
1707         sv_dump(src);
1708         croak("Corrupted weakref? weakref_ofs should be 0, but got %"UVuf" (this should not happen)", weakref_ofs);
1709     }
1710 
1711     if (replacement) {
1712         if (SvROK(replacement))  {
1713             src= SvRV(replacement);
1714         } else {
1715             src= replacement;
1716         }
1717         replacement= NULL;
1718         svt = SvTYPE(src);
1719         /* plus one ensures that later on we get REFN/ARRAY and not ARRAYREF - This is horrible tho. needs to be revisited another day */
1720         refcount= SvREFCNT(src) + 1;
1721         /* We could, but do not do the following:*/
1722         /* goto redo_dump; */
1723         /* Probably a "proper" solution would, but there are nits there that I dont want to chase right now. */
1724     }
1725 
1726     /* --------------------------------- */
1727     _SRL_IF_SIMPLE_DIRECT_DUMP_SV(enc, src, svt)
1728     else
1729 #if defined(MODERN_REGEXP) && defined(REGEXP_NO_LONGER_POK)
1730     /* Only need to enter here if we have rather modern regexps AND they're
1731      * NO LONGER POK (5.17.6 and up). */
1732     if ( expect_false( svt == SVt_REGEXP ) ) {
1733         srl_dump_regexp(aTHX_ enc, src);
1734     }
1735     else
1736 #endif
1737     if (SvROK(src)) {
1738         /* dump references */
1739         SV *referent= SvRV(src);
1740 /* assert()-like hack to be compiled out by default */
1741 #ifndef NDEBUG
1742         if (!referent) {
1743             sv_dump(src);
1744             assert(referent);
1745         }
1746 #endif
1747         if (expect_false( SvWEAKREF(src) )) {
1748             if (DEBUGHACK) warn("Is weakref %p", src);
1749             weakref_ofs= BODY_POS_OFS(&enc->buf);
1750             srl_buf_cat_char(&enc->buf, SRL_HDR_WEAKEN);
1751         }
1752 
1753         ref_rewrite_pos= BODY_POS_OFS(&enc->buf);
1754 
1755         if ( expect_false( sv_isobject(src) ) ) {
1756             /* Write bless operator with class name */
1757             replacement= srl_get_frozen_object(aTHX_ enc, src, referent);
1758             if (srl_dump_classname(aTHX_ enc, referent, replacement)) {
1759                 /* 1 means we should not rewrite away the classname */
1760                 ref_rewrite_pos= BODY_POS_OFS(&enc->buf);
1761             }
1762         }
1763 
1764         srl_buf_cat_char(&enc->buf, SRL_HDR_REFN);
1765         refsv= src;
1766         src= referent;
1767 
1768         if (DEBUGHACK) warn("Going to redo %p", src);
1769         goto redo_dump;
1770     }
1771     else
1772 #ifndef MODERN_REGEXP
1773     if (
1774         svt == SVt_PVMG &&
1775         ((SvFLAGS(src) & (SVs_OBJECT|SVf_OK|SVs_GMG|SVs_SMG|SVs_RMG)) == (SVs_OBJECT|BFD_Svs_SMG_OR_RMG)) &&
1776         (mg = mg_find(src, PERL_MAGIC_qr))
1777     ) {
1778         /* Houston, we have a regex! */
1779         srl_dump_regexp(aTHX_ enc, (SV*)mg); /* yes the SV* cast makes me feel dirty too */
1780     }
1781     else
1782 #endif
1783     if (svt == SVt_PVHV) {
1784         srl_dump_hv(aTHX_ enc, (HV *)src, refcount);
1785     }
1786     else
1787     if (svt == SVt_PVAV) {
1788         srl_dump_av(aTHX_ enc, (AV *)src, refcount);
1789     }
1790     else
1791     if ( ! SvOK(src) ) { /* undef and weird shit */
1792         if ( SRL_UNSUPPORTED_SvTYPE(svt) ) {
1793             /* we exclude magic, because magic sv's can be undef too */
1794             /* called when we find an unsupported type/reference. May either throw exception
1795              * or write ONE (nested or single) item to the buffer. */
1796 #define SRL_HANDLE_UNSUPPORTED_SvTYPE(enc, src, svt, refsv, ref_rewrite_pos)                     \
1797             STMT_START {                                                                       \
1798                 if ( SRL_ENC_HAVE_OPTION((enc), SRL_F_UNDEF_UNKNOWN) ) {                       \
1799                     if (SRL_ENC_HAVE_OPTION((enc), SRL_F_WARN_UNKNOWN))                        \
1800                         warn("Found type %u %s(0x%p), but it is not representable "            \
1801                              "by the Sereal encoding format; will encode as an "               \
1802                              "undefined value", (svt), sv_reftype((src),0),(src));             \
1803                     if (ref_rewrite_pos) {                                                     \
1804                         /* make sure we don't keep a reference to the thing that we do not     \
1805                          * want to serialize around for REFP and ALIAS output */               \
1806                         PTABLE_t *ref_seenhash= SRL_GET_REF_SEENHASH(enc);                     \
1807                         PTABLE_delete(ref_seenhash, src);                                      \
1808                         enc->buf.pos= enc->buf.body_pos + ref_rewrite_pos;                     \
1809                     }                                                                          \
1810                     srl_buf_cat_char(&(enc)->buf, SRL_HDR_UNDEF);                              \
1811                 }                                                                              \
1812                 else if ( SRL_ENC_HAVE_OPTION((enc), SRL_F_STRINGIFY_UNKNOWN) ) {              \
1813                     STRLEN len;                                                                \
1814                     char *str;                                                                 \
1815                     if (SRL_ENC_HAVE_OPTION((enc), SRL_F_WARN_UNKNOWN)) {                      \
1816                         /* In theory, we need to warn about stringifying this unsupported      \
1817                          * item. However, if the SRL_F_NOWARN_UNKNOWN_OVERLOAD option is set,  \
1818                          * then we DO NOT warn about stringifying this unsupported item if     \
1819                          * it is an object with string overloading (assuming it's done on      \
1820                          * purpose to stringify in cases like these).                          \
1821                          */                                                                    \
1822                         if (!SRL_ENC_HAVE_OPTION((enc), SRL_F_NOWARN_UNKNOWN_OVERLOAD)         \
1823                              || !SvOBJECT(src)                                                 \
1824                              || !Gv_AMG(SvSTASH(src)))                                         \
1825                         {                                                                      \
1826                             warn("Found type %u %s(0x%p), but it is not representable "        \
1827                                  "by the Sereal encoding format; will encode as a "            \
1828                                  "stringified form", (svt), sv_reftype((src),0),(src));        \
1829                         }                                                                      \
1830                     }                                                                          \
1831                     if (ref_rewrite_pos) {                                                     \
1832                         /* make sure we don't keep a reference to the thing that we do not     \
1833                          * want to serialize around for REFP and ALIAS output */               \
1834                         PTABLE_t *ref_seenhash= SRL_GET_REF_SEENHASH(enc);                     \
1835                         PTABLE_delete(ref_seenhash, src);                                      \
1836                         enc->buf.pos= enc->buf.body_pos + ref_rewrite_pos;                     \
1837                         str = SvPV((refsv), len);                                              \
1838                     } else                                                                     \
1839                         str = SvPV((src), len);                                                \
1840                     srl_dump_pv(aTHX_ (enc), (str), len, SvUTF8(src));                         \
1841                 }                                                                              \
1842                 else {                                                                         \
1843                     croak("Found type %u %s(0x%p), but it is not representable "               \
1844                           "by the Sereal encoding format", (svt), sv_reftype((src),0),(src));  \
1845                 }                                                                              \
1846             } STMT_END
1847             SRL_HANDLE_UNSUPPORTED_SvTYPE(enc, src, svt, refsv, ref_rewrite_pos);
1848         }
1849         else if (src == &PL_sv_undef && enc->protocol_version >= 3 ) {
1850             srl_buf_cat_char(&enc->buf, SRL_HDR_CANONICAL_UNDEF);
1851         } else {
1852             srl_buf_cat_char(&enc->buf, SRL_HDR_UNDEF);
1853         }
1854     }
1855     else {
1856         SRL_HANDLE_UNSUPPORTED_SvTYPE(enc, src, svt, refsv, ref_rewrite_pos);
1857 #undef SRL_HANDLE_UNSUPPORTED_SvTYPE
1858     }
1859     --enc->recursion_depth;
1860 }
1861 
1862