1 /**********************************************************************
2
3 pack.c -
4
5 $Author: shyouhei $
6 created at: Thu Feb 10 15:17:05 JST 1994
7
8 Copyright (C) 1993-2007 Yukihiro Matsumoto
9
10 **********************************************************************/
11
12 #include "ruby/encoding.h"
13 #include "internal.h"
14 #include <sys/types.h>
15 #include <ctype.h>
16 #include <errno.h>
17 #include <float.h>
18
/*
 * It is intentional that the condition for natstr is HAVE_TRUE_LONG_LONG
 * instead of HAVE_LONG_LONG or LONG_LONG.
 * This means q! and Q! always mean the standard long long type and
 * cause ArgumentError on platforms which have no long long type,
 * even if the platform has an implementation-specific 64-bit type.
 * This behavior is consistent with the documentation of pack/unpack.
 */
27 #ifdef HAVE_TRUE_LONG_LONG
28 static const char natstr[] = "sSiIlLqQjJ";
29 #else
30 static const char natstr[] = "sSiIlLjJ";
31 #endif
32 static const char endstr[] = "sSiIlLqQjJ";
33
34 #ifdef HAVE_TRUE_LONG_LONG
35 /* It is intentional to use long long instead of LONG_LONG. */
36 # define NATINT_LEN_Q NATINT_LEN(long long, 8)
37 #else
38 # define NATINT_LEN_Q 8
39 #endif
40
41 #if SIZEOF_SHORT != 2 || SIZEOF_LONG != 4 || (defined(HAVE_TRUE_LONG_LONG) && SIZEOF_LONG_LONG != 8)
42 # define NATINT_PACK
43 #endif
44
45 #ifdef DYNAMIC_ENDIAN
46 /* for universal binary of NEXTSTEP and MacOS X */
47 /* useless since autoconf 2.63? */
/*
 * Reports the byte order of the machine the code is running on.
 * The probe is done once and the answer is cached in a static.
 * Returns 1 on a big-endian host, 0 on a little-endian host.
 */
static int
is_bigendian(void)
{
    static int probed = 0;
    static int cached;

    if (!probed) {
        const char *first_byte;

        /* probed now holds the int value 1; inspect its lowest-addressed byte */
        probed = 1;
        first_byte = (const char *)&probed;
        cached = first_byte[0] ? 0 : 1;
    }
    return cached;
}
60 # define BIGENDIAN_P() (is_bigendian())
61 #elif defined(WORDS_BIGENDIAN)
62 # define BIGENDIAN_P() 1
63 #else
64 # define BIGENDIAN_P() 0
65 #endif
66
67 #ifdef NATINT_PACK
68 # define NATINT_LEN(type,len) (natint?(int)sizeof(type):(int)(len))
69 #else
70 # define NATINT_LEN(type,len) ((int)sizeof(type))
71 #endif
72
/* Overlays a float with a 32-bit integer and a 4-byte buffer so the same
 * storage can be byte-swapped as an integer and emitted as raw bytes. */
typedef union {
    float f;
    uint32_t u;
    char buf[4];
} FLOAT_SWAPPER;
/* Same overlay for a double: 64-bit integer and an 8-byte buffer. */
typedef union {
    double d;
    uint64_t u;
    char buf[8];
} DOUBLE_SWAPPER;
/* swap32/swap64 are defined outside this section; presumably they reverse
 * the byte order of their argument -- confirm at their definition. */
#define swapf(x) swap32(x)
#define swapd(x) swap64(x)

/* Naming: n = network order, v = VAX order, h = host; f = float, d = double.
 * The n-variants leave the value untouched on a big-endian host and swap
 * otherwise; the v-variants swap only on a big-endian host. */
#define rb_ntohf(x) (BIGENDIAN_P()?(x):swapf(x))
#define rb_ntohd(x) (BIGENDIAN_P()?(x):swapd(x))
#define rb_htonf(x) (BIGENDIAN_P()?(x):swapf(x))
#define rb_htond(x) (BIGENDIAN_P()?(x):swapd(x))
#define rb_htovf(x) (BIGENDIAN_P()?swapf(x):(x))
#define rb_htovd(x) (BIGENDIAN_P()?swapd(x):(x))
#define rb_vtohf(x) (BIGENDIAN_P()?swapf(x):(x))
#define rb_vtohd(x) (BIGENDIAN_P()?swapd(x):(x))

/* Declares a FLOAT_SWAPPER named x; the remaining macros convert its
 * integer view (x.u) in place. */
#define FLOAT_CONVWITH(x) FLOAT_SWAPPER x;
#define HTONF(x) ((x).u = rb_htonf((x).u))
#define HTOVF(x) ((x).u = rb_htovf((x).u))
#define NTOHF(x) ((x).u = rb_ntohf((x).u))
#define VTOHF(x) ((x).u = rb_vtohf((x).u))

/* Declares a DOUBLE_SWAPPER named x; the remaining macros convert its
 * integer view (x.u) in place. */
#define DOUBLE_CONVWITH(x) DOUBLE_SWAPPER x;
#define HTOND(x) ((x).u = rb_htond((x).u))
#define HTOVD(x) ((x).u = rb_htovd((x).u))
#define NTOHD(x) ((x).u = rb_ntohd((x).u))
#define VTOHD(x) ((x).u = rb_vtohd((x).u))
106
107 #define MAX_INTEGER_PACK_SIZE 8
108
109 static const char toofew[] = "too few arguments";
110
111 static void encodes(VALUE,const char*,long,int,int);
112 static void qpencode(VALUE,VALUE,long);
113
114 static unsigned long utf8_to_uv(const char*,long*);
115
116 static ID id_associated;
117
118 static void
str_associate(VALUE str,VALUE add)119 str_associate(VALUE str, VALUE add)
120 {
121 /* assert(NIL_P(rb_attr_get(str, id_associated))); */
122 rb_ivar_set(str, id_associated, add);
123 }
124
125 static VALUE
str_associated(VALUE str)126 str_associated(VALUE str)
127 {
128 return rb_ivar_lookup(str, id_associated, Qfalse);
129 }
130
131 static void
unknown_directive(const char * mode,char type,VALUE fmt)132 unknown_directive(const char *mode, char type, VALUE fmt)
133 {
134 VALUE f;
135 char unknown[5];
136
137 if (ISPRINT(type)) {
138 unknown[0] = type;
139 unknown[1] = '\0';
140 }
141 else {
142 snprintf(unknown, sizeof(unknown), "\\x%.2x", type & 0xff);
143 }
144 f = rb_str_quote_unprintable(fmt);
145 if (f != fmt) {
146 fmt = rb_str_subseq(f, 1, RSTRING_LEN(f) - 2);
147 }
148 rb_warning("unknown %s directive '%s' in '%"PRIsVALUE"'",
149 mode, unknown, fmt);
150 }
151
152 static float
VALUE_to_float(VALUE obj)153 VALUE_to_float(VALUE obj)
154 {
155 VALUE v = rb_to_float(obj);
156 double d = RFLOAT_VALUE(v);
157
158 if (isnan(d)) {
159 return NAN;
160 }
161 else if (d < -FLT_MAX) {
162 return -INFINITY;
163 }
164 else if (d <= FLT_MAX) {
165 return d;
166 }
167 else {
168 return INFINITY;
169 }
170 }
171
172 /*
173 * call-seq:
174 * arr.pack( aTemplateString ) -> aBinaryString
175 * arr.pack( aTemplateString, buffer: aBufferString ) -> aBufferString
176 *
 *  Packs the contents of <i>arr</i> into a binary sequence according to
 *  the directives in <i>aTemplateString</i> (see the table below).
 *  Directives ``A,'' ``a,'' and ``Z'' may be followed by a count,
 *  which gives the width of the resulting field. The remaining
 *  directives also may take a count, indicating the number of array
 *  elements to convert. If the count is an asterisk
 *  (``<code>*</code>''), all remaining array elements will be
 *  converted. Any of the directives ``<code>sSiIlLqQjJ</code>'' may be
 *  followed by an underscore (``<code>_</code>'') or
 *  exclamation mark (``<code>!</code>'') to use the underlying
 *  platform's native size for the specified type; otherwise, they use a
 *  platform-independent size. Spaces are ignored in the template
 *  string. See also <code>String#unpack</code>.
190 *
191 * a = [ "a", "b", "c" ]
192 * n = [ 65, 66, 67 ]
 *     a.pack("A3A3A3")   #=> "a  b  c  "
194 * a.pack("a3a3a3") #=> "a\000\000b\000\000c\000\000"
195 * n.pack("ccc") #=> "ABC"
196 *
197 * If <i>aBufferString</i> is specified and its capacity is enough,
198 * +pack+ uses it as the buffer and returns it.
 *  When an offset is given at the beginning of <i>aTemplateString</i>
 *  (with the ``<code>@</code>'' directive), the result is written
 *  starting at that offset.
 *  If <i>aBufferString</i> already has contents longer than the offset,
 *  the contents after the offset are overwritten by the result.
 *  If it is shorter, the gap is filled with ``<code>\0</code>''.
204 *
205 * Note that ``buffer:'' option does not guarantee not to allocate memory
206 * in +pack+. If the capacity of <i>aBufferString</i> is not enough,
207 * +pack+ allocates memory.
208 *
209 * Directives for +pack+.
210 *
211 * Integer | Array |
212 * Directive | Element | Meaning
213 * ----------------------------------------------------------------------------
214 * C | Integer | 8-bit unsigned (unsigned char)
215 * S | Integer | 16-bit unsigned, native endian (uint16_t)
216 * L | Integer | 32-bit unsigned, native endian (uint32_t)
217 * Q | Integer | 64-bit unsigned, native endian (uint64_t)
218 * J | Integer | pointer width unsigned, native endian (uintptr_t)
219 * | | (J is available since Ruby 2.3.)
220 * | |
221 * c | Integer | 8-bit signed (signed char)
222 * s | Integer | 16-bit signed, native endian (int16_t)
223 * l | Integer | 32-bit signed, native endian (int32_t)
224 * q | Integer | 64-bit signed, native endian (int64_t)
225 * j | Integer | pointer width signed, native endian (intptr_t)
226 * | | (j is available since Ruby 2.3.)
227 * | |
228 * S_ S! | Integer | unsigned short, native endian
229 * I I_ I! | Integer | unsigned int, native endian
230 * L_ L! | Integer | unsigned long, native endian
231 * Q_ Q! | Integer | unsigned long long, native endian (ArgumentError
232 * | | if the platform has no long long type.)
 *                 |         | (Q_ and Q! are available since Ruby 2.1.)
234 * J! | Integer | uintptr_t, native endian (same with J)
235 * | | (J! is available since Ruby 2.3.)
236 * | |
237 * s_ s! | Integer | signed short, native endian
238 * i i_ i! | Integer | signed int, native endian
239 * l_ l! | Integer | signed long, native endian
240 * q_ q! | Integer | signed long long, native endian (ArgumentError
241 * | | if the platform has no long long type.)
 *                 |         | (q_ and q! are available since Ruby 2.1.)
243 * j! | Integer | intptr_t, native endian (same with j)
244 * | | (j! is available since Ruby 2.3.)
245 * | |
246 * S> s> S!> s!> | Integer | same as the directives without ">" except
247 * L> l> L!> l!> | | big endian
248 * I!> i!> | | (available since Ruby 1.9.3)
249 * Q> q> Q!> q!> | | "S>" is same as "n"
250 * J> j> J!> j!> | | "L>" is same as "N"
251 * | |
252 * S< s< S!< s!< | Integer | same as the directives without "<" except
253 * L< l< L!< l!< | | little endian
254 * I!< i!< | | (available since Ruby 1.9.3)
255 * Q< q< Q!< q!< | | "S<" is same as "v"
256 * J< j< J!< j!< | | "L<" is same as "V"
257 * | |
258 * n | Integer | 16-bit unsigned, network (big-endian) byte order
259 * N | Integer | 32-bit unsigned, network (big-endian) byte order
260 * v | Integer | 16-bit unsigned, VAX (little-endian) byte order
261 * V | Integer | 32-bit unsigned, VAX (little-endian) byte order
262 * | |
263 * U | Integer | UTF-8 character
264 * w | Integer | BER-compressed integer
265 *
266 * Float | Array |
267 * Directive | Element | Meaning
268 * ---------------------------------------------------------------------------
269 * D d | Float | double-precision, native format
270 * F f | Float | single-precision, native format
271 * E | Float | double-precision, little-endian byte order
272 * e | Float | single-precision, little-endian byte order
273 * G | Float | double-precision, network (big-endian) byte order
274 * g | Float | single-precision, network (big-endian) byte order
275 *
276 * String | Array |
277 * Directive | Element | Meaning
278 * ---------------------------------------------------------------------------
279 * A | String | arbitrary binary string (space padded, count is width)
280 * a | String | arbitrary binary string (null padded, count is width)
281 * Z | String | same as ``a'', except that null is added with *
282 * B | String | bit string (MSB first)
283 * b | String | bit string (LSB first)
284 * H | String | hex string (high nibble first)
285 * h | String | hex string (low nibble first)
286 * u | String | UU-encoded string
287 * M | String | quoted printable, MIME encoding (see also RFC2045)
288 * | | (text mode but input must use LF and output LF)
289 * m | String | base64 encoded string (see RFC 2045, count is width)
 *                |         | (if count is 0, no line feeds are added, see RFC 4648)
291 * P | String | pointer to a structure (fixed-length string)
292 * p | String | pointer to a null-terminated string
293 *
294 * Misc. | Array |
295 * Directive | Element | Meaning
296 * ---------------------------------------------------------------------------
297 * @ | --- | moves to absolute position
298 * X | --- | back up a byte
299 * x | --- | null byte
300 */
301
302 static VALUE
pack_pack(int argc,VALUE * argv,VALUE ary)303 pack_pack(int argc, VALUE *argv, VALUE ary)
304 {
305 static const char nul10[] = "\0\0\0\0\0\0\0\0\0\0";
306 static const char spc10[] = " ";
307 const char *p, *pend;
308 VALUE fmt, opt = Qnil, res, from, associates = 0, buffer = 0;
309 char type;
310 long len, idx, plen;
311 const char *ptr;
312 int enc_info = 1; /* 0 - BINARY, 1 - US-ASCII, 2 - UTF-8 */
313 #ifdef NATINT_PACK
314 int natint; /* native integer */
315 #endif
316 int integer_size, bigendian_p;
317
318 rb_scan_args(argc, argv, "10:", &fmt, &opt);
319
320 StringValue(fmt);
321 p = RSTRING_PTR(fmt);
322 pend = p + RSTRING_LEN(fmt);
323 if (!NIL_P(opt)) {
324 static ID keyword_ids[1];
325 if (!keyword_ids[0])
326 CONST_ID(keyword_ids[0], "buffer");
327
328 rb_get_kwargs(opt, keyword_ids, 0, 1, &buffer);
329
330 if (buffer != Qundef && !RB_TYPE_P(buffer, T_STRING))
331 rb_raise(rb_eTypeError, "buffer must be String, not %s", rb_obj_classname(buffer));
332 }
333 if (buffer)
334 res = buffer;
335 else
336 res = rb_str_buf_new(0);
337
338 idx = 0;
339
340 #define TOO_FEW (rb_raise(rb_eArgError, toofew), 0)
341 #define MORE_ITEM (idx < RARRAY_LEN(ary))
342 #define THISFROM (MORE_ITEM ? RARRAY_AREF(ary, idx) : TOO_FEW)
343 #define NEXTFROM (MORE_ITEM ? RARRAY_AREF(ary, idx++) : TOO_FEW)
344
345 while (p < pend) {
346 int explicit_endian = 0;
347 if (RSTRING_PTR(fmt) + RSTRING_LEN(fmt) != pend) {
348 rb_raise(rb_eRuntimeError, "format string modified");
349 }
350 type = *p++; /* get data type */
351 #ifdef NATINT_PACK
352 natint = 0;
353 #endif
354
355 if (ISSPACE(type)) continue;
356 if (type == '#') {
357 while ((p < pend) && (*p != '\n')) {
358 p++;
359 }
360 continue;
361 }
362
363 {
364 modifiers:
365 switch (*p) {
366 case '_':
367 case '!':
368 if (strchr(natstr, type)) {
369 #ifdef NATINT_PACK
370 natint = 1;
371 #endif
372 p++;
373 }
374 else {
375 rb_raise(rb_eArgError, "'%c' allowed only after types %s", *p, natstr);
376 }
377 goto modifiers;
378
379 case '<':
380 case '>':
381 if (!strchr(endstr, type)) {
382 rb_raise(rb_eArgError, "'%c' allowed only after types %s", *p, endstr);
383 }
384 if (explicit_endian) {
385 rb_raise(rb_eRangeError, "Can't use both '<' and '>'");
386 }
387 explicit_endian = *p++;
388 goto modifiers;
389 }
390 }
391
392 if (*p == '*') { /* set data length */
393 len = strchr("@Xxu", type) ? 0
394 : strchr("PMm", type) ? 1
395 : RARRAY_LEN(ary) - idx;
396 p++;
397 }
398 else if (ISDIGIT(*p)) {
399 errno = 0;
400 len = STRTOUL(p, (char**)&p, 10);
401 if (errno) {
402 rb_raise(rb_eRangeError, "pack length too big");
403 }
404 }
405 else {
406 len = 1;
407 }
408
409 switch (type) {
410 case 'U':
411 /* if encoding is US-ASCII, upgrade to UTF-8 */
412 if (enc_info == 1) enc_info = 2;
413 break;
414 case 'm': case 'M': case 'u':
415 /* keep US-ASCII (do nothing) */
416 break;
417 default:
418 /* fall back to BINARY */
419 enc_info = 0;
420 break;
421 }
422 switch (type) {
423 case 'A': case 'a': case 'Z':
424 case 'B': case 'b':
425 case 'H': case 'h':
426 from = NEXTFROM;
427 if (NIL_P(from)) {
428 ptr = "";
429 plen = 0;
430 }
431 else {
432 StringValue(from);
433 ptr = RSTRING_PTR(from);
434 plen = RSTRING_LEN(from);
435 OBJ_INFECT(res, from);
436 }
437
438 if (p[-1] == '*')
439 len = plen;
440
441 switch (type) {
442 case 'a': /* arbitrary binary string (null padded) */
443 case 'A': /* arbitrary binary string (ASCII space padded) */
444 case 'Z': /* null terminated string */
445 if (plen >= len) {
446 rb_str_buf_cat(res, ptr, len);
447 if (p[-1] == '*' && type == 'Z')
448 rb_str_buf_cat(res, nul10, 1);
449 }
450 else {
451 rb_str_buf_cat(res, ptr, plen);
452 len -= plen;
453 while (len >= 10) {
454 rb_str_buf_cat(res, (type == 'A')?spc10:nul10, 10);
455 len -= 10;
456 }
457 rb_str_buf_cat(res, (type == 'A')?spc10:nul10, len);
458 }
459 break;
460
461 #define castchar(from) (char)((from) & 0xff)
462
463 case 'b': /* bit string (ascending) */
464 {
465 int byte = 0;
466 long i, j = 0;
467
468 if (len > plen) {
469 j = (len - plen + 1)/2;
470 len = plen;
471 }
472 for (i=0; i++ < len; ptr++) {
473 if (*ptr & 1)
474 byte |= 128;
475 if (i & 7)
476 byte >>= 1;
477 else {
478 char c = castchar(byte);
479 rb_str_buf_cat(res, &c, 1);
480 byte = 0;
481 }
482 }
483 if (len & 7) {
484 char c;
485 byte >>= 7 - (len & 7);
486 c = castchar(byte);
487 rb_str_buf_cat(res, &c, 1);
488 }
489 len = j;
490 goto grow;
491 }
492 break;
493
494 case 'B': /* bit string (descending) */
495 {
496 int byte = 0;
497 long i, j = 0;
498
499 if (len > plen) {
500 j = (len - plen + 1)/2;
501 len = plen;
502 }
503 for (i=0; i++ < len; ptr++) {
504 byte |= *ptr & 1;
505 if (i & 7)
506 byte <<= 1;
507 else {
508 char c = castchar(byte);
509 rb_str_buf_cat(res, &c, 1);
510 byte = 0;
511 }
512 }
513 if (len & 7) {
514 char c;
515 byte <<= 7 - (len & 7);
516 c = castchar(byte);
517 rb_str_buf_cat(res, &c, 1);
518 }
519 len = j;
520 goto grow;
521 }
522 break;
523
524 case 'h': /* hex string (low nibble first) */
525 {
526 int byte = 0;
527 long i, j = 0;
528
529 if (len > plen) {
530 j = (len + 1) / 2 - (plen + 1) / 2;
531 len = plen;
532 }
533 for (i=0; i++ < len; ptr++) {
534 if (ISALPHA(*ptr))
535 byte |= (((*ptr & 15) + 9) & 15) << 4;
536 else
537 byte |= (*ptr & 15) << 4;
538 if (i & 1)
539 byte >>= 4;
540 else {
541 char c = castchar(byte);
542 rb_str_buf_cat(res, &c, 1);
543 byte = 0;
544 }
545 }
546 if (len & 1) {
547 char c = castchar(byte);
548 rb_str_buf_cat(res, &c, 1);
549 }
550 len = j;
551 goto grow;
552 }
553 break;
554
555 case 'H': /* hex string (high nibble first) */
556 {
557 int byte = 0;
558 long i, j = 0;
559
560 if (len > plen) {
561 j = (len + 1) / 2 - (plen + 1) / 2;
562 len = plen;
563 }
564 for (i=0; i++ < len; ptr++) {
565 if (ISALPHA(*ptr))
566 byte |= ((*ptr & 15) + 9) & 15;
567 else
568 byte |= *ptr & 15;
569 if (i & 1)
570 byte <<= 4;
571 else {
572 char c = castchar(byte);
573 rb_str_buf_cat(res, &c, 1);
574 byte = 0;
575 }
576 }
577 if (len & 1) {
578 char c = castchar(byte);
579 rb_str_buf_cat(res, &c, 1);
580 }
581 len = j;
582 goto grow;
583 }
584 break;
585 }
586 break;
587
588 case 'c': /* signed char */
589 case 'C': /* unsigned char */
590 integer_size = 1;
591 bigendian_p = BIGENDIAN_P(); /* not effective */
592 goto pack_integer;
593
594 case 's': /* s for int16_t, s! for signed short */
595 integer_size = NATINT_LEN(short, 2);
596 bigendian_p = BIGENDIAN_P();
597 goto pack_integer;
598
599 case 'S': /* S for uint16_t, S! for unsigned short */
600 integer_size = NATINT_LEN(short, 2);
601 bigendian_p = BIGENDIAN_P();
602 goto pack_integer;
603
604 case 'i': /* i and i! for signed int */
605 integer_size = (int)sizeof(int);
606 bigendian_p = BIGENDIAN_P();
607 goto pack_integer;
608
609 case 'I': /* I and I! for unsigned int */
610 integer_size = (int)sizeof(int);
611 bigendian_p = BIGENDIAN_P();
612 goto pack_integer;
613
614 case 'l': /* l for int32_t, l! for signed long */
615 integer_size = NATINT_LEN(long, 4);
616 bigendian_p = BIGENDIAN_P();
617 goto pack_integer;
618
619 case 'L': /* L for uint32_t, L! for unsigned long */
620 integer_size = NATINT_LEN(long, 4);
621 bigendian_p = BIGENDIAN_P();
622 goto pack_integer;
623
624 case 'q': /* q for int64_t, q! for signed long long */
625 integer_size = NATINT_LEN_Q;
626 bigendian_p = BIGENDIAN_P();
627 goto pack_integer;
628
629 case 'Q': /* Q for uint64_t, Q! for unsigned long long */
630 integer_size = NATINT_LEN_Q;
631 bigendian_p = BIGENDIAN_P();
632 goto pack_integer;
633
634 case 'j': /* j for intptr_t */
635 integer_size = sizeof(intptr_t);
636 bigendian_p = BIGENDIAN_P();
637 goto pack_integer;
638
639 case 'J': /* J for uintptr_t */
640 integer_size = sizeof(uintptr_t);
641 bigendian_p = BIGENDIAN_P();
642 goto pack_integer;
643
644 case 'n': /* 16 bit (2 bytes) integer (network byte-order) */
645 integer_size = 2;
646 bigendian_p = 1;
647 goto pack_integer;
648
649 case 'N': /* 32 bit (4 bytes) integer (network byte-order) */
650 integer_size = 4;
651 bigendian_p = 1;
652 goto pack_integer;
653
654 case 'v': /* 16 bit (2 bytes) integer (VAX byte-order) */
655 integer_size = 2;
656 bigendian_p = 0;
657 goto pack_integer;
658
659 case 'V': /* 32 bit (4 bytes) integer (VAX byte-order) */
660 integer_size = 4;
661 bigendian_p = 0;
662 goto pack_integer;
663
664 pack_integer:
665 if (explicit_endian) {
666 bigendian_p = explicit_endian == '>';
667 }
668 if (integer_size > MAX_INTEGER_PACK_SIZE)
669 rb_bug("unexpected intger size for pack: %d", integer_size);
670 while (len-- > 0) {
671 char intbuf[MAX_INTEGER_PACK_SIZE];
672
673 from = NEXTFROM;
674 rb_integer_pack(from, intbuf, integer_size, 1, 0,
675 INTEGER_PACK_2COMP |
676 (bigendian_p ? INTEGER_PACK_BIG_ENDIAN : INTEGER_PACK_LITTLE_ENDIAN));
677 rb_str_buf_cat(res, intbuf, integer_size);
678 }
679 break;
680
681 case 'f': /* single precision float in native format */
682 case 'F': /* ditto */
683 while (len-- > 0) {
684 float f;
685
686 from = NEXTFROM;
687 f = VALUE_to_float(from);
688 rb_str_buf_cat(res, (char*)&f, sizeof(float));
689 }
690 break;
691
692 case 'e': /* single precision float in VAX byte-order */
693 while (len-- > 0) {
694 FLOAT_CONVWITH(tmp);
695
696 from = NEXTFROM;
697 tmp.f = VALUE_to_float(from);
698 HTOVF(tmp);
699 rb_str_buf_cat(res, tmp.buf, sizeof(float));
700 }
701 break;
702
703 case 'E': /* double precision float in VAX byte-order */
704 while (len-- > 0) {
705 DOUBLE_CONVWITH(tmp);
706 from = NEXTFROM;
707 tmp.d = RFLOAT_VALUE(rb_to_float(from));
708 HTOVD(tmp);
709 rb_str_buf_cat(res, tmp.buf, sizeof(double));
710 }
711 break;
712
713 case 'd': /* double precision float in native format */
714 case 'D': /* ditto */
715 while (len-- > 0) {
716 double d;
717
718 from = NEXTFROM;
719 d = RFLOAT_VALUE(rb_to_float(from));
720 rb_str_buf_cat(res, (char*)&d, sizeof(double));
721 }
722 break;
723
724 case 'g': /* single precision float in network byte-order */
725 while (len-- > 0) {
726 FLOAT_CONVWITH(tmp);
727 from = NEXTFROM;
728 tmp.f = VALUE_to_float(from);
729 HTONF(tmp);
730 rb_str_buf_cat(res, tmp.buf, sizeof(float));
731 }
732 break;
733
734 case 'G': /* double precision float in network byte-order */
735 while (len-- > 0) {
736 DOUBLE_CONVWITH(tmp);
737
738 from = NEXTFROM;
739 tmp.d = RFLOAT_VALUE(rb_to_float(from));
740 HTOND(tmp);
741 rb_str_buf_cat(res, tmp.buf, sizeof(double));
742 }
743 break;
744
745 case 'x': /* null byte */
746 grow:
747 while (len >= 10) {
748 rb_str_buf_cat(res, nul10, 10);
749 len -= 10;
750 }
751 rb_str_buf_cat(res, nul10, len);
752 break;
753
754 case 'X': /* back up byte */
755 shrink:
756 plen = RSTRING_LEN(res);
757 if (plen < len)
758 rb_raise(rb_eArgError, "X outside of string");
759 rb_str_set_len(res, plen - len);
760 break;
761
762 case '@': /* null fill to absolute position */
763 len -= RSTRING_LEN(res);
764 if (len > 0) goto grow;
765 len = -len;
766 if (len > 0) goto shrink;
767 break;
768
769 case '%':
770 rb_raise(rb_eArgError, "%% is not supported");
771 break;
772
773 case 'U': /* Unicode character */
774 while (len-- > 0) {
775 SIGNED_VALUE l;
776 char buf[8];
777 int le;
778
779 from = NEXTFROM;
780 from = rb_to_int(from);
781 l = NUM2LONG(from);
782 if (l < 0) {
783 rb_raise(rb_eRangeError, "pack(U): value out of range");
784 }
785 le = rb_uv_to_utf8(buf, l);
786 rb_str_buf_cat(res, (char*)buf, le);
787 }
788 break;
789
790 case 'u': /* uuencoded string */
791 case 'm': /* base64 encoded string */
792 from = NEXTFROM;
793 StringValue(from);
794 ptr = RSTRING_PTR(from);
795 plen = RSTRING_LEN(from);
796 OBJ_INFECT(res, from);
797
798 if (len == 0 && type == 'm') {
799 encodes(res, ptr, plen, type, 0);
800 ptr += plen;
801 break;
802 }
803 if (len <= 2)
804 len = 45;
805 else if (len > 63 && type == 'u')
806 len = 63;
807 else
808 len = len / 3 * 3;
809 while (plen > 0) {
810 long todo;
811
812 if (plen > len)
813 todo = len;
814 else
815 todo = plen;
816 encodes(res, ptr, todo, type, 1);
817 plen -= todo;
818 ptr += todo;
819 }
820 break;
821
822 case 'M': /* quoted-printable encoded string */
823 from = rb_obj_as_string(NEXTFROM);
824 OBJ_INFECT(res, from);
825 if (len <= 1)
826 len = 72;
827 qpencode(res, from, len);
828 break;
829
830 case 'P': /* pointer to packed byte string */
831 from = THISFROM;
832 if (!NIL_P(from)) {
833 StringValue(from);
834 if (RSTRING_LEN(from) < len) {
835 rb_raise(rb_eArgError, "too short buffer for P(%ld for %ld)",
836 RSTRING_LEN(from), len);
837 }
838 }
839 len = 1;
840 /* FALL THROUGH */
841 case 'p': /* pointer to string */
842 while (len-- > 0) {
843 char *t;
844 from = NEXTFROM;
845 if (NIL_P(from)) {
846 t = 0;
847 }
848 else {
849 t = StringValuePtr(from);
850 OBJ_INFECT(res, from);
851 rb_obj_taint(from);
852 }
853 if (!associates) {
854 associates = rb_ary_new();
855 }
856 rb_ary_push(associates, from);
857 rb_str_buf_cat(res, (char*)&t, sizeof(char*));
858 }
859 break;
860
861 case 'w': /* BER compressed integer */
862 while (len-- > 0) {
863 VALUE buf = rb_str_new(0, 0);
864 size_t numbytes;
865 int sign;
866 char *cp;
867
868 from = NEXTFROM;
869 from = rb_to_int(from);
870 numbytes = rb_absint_numwords(from, 7, NULL);
871 if (numbytes == 0)
872 numbytes = 1;
873 buf = rb_str_new(NULL, numbytes);
874
875 sign = rb_integer_pack(from, RSTRING_PTR(buf), RSTRING_LEN(buf), 1, 1, INTEGER_PACK_BIG_ENDIAN);
876
877 if (sign < 0)
878 rb_raise(rb_eArgError, "can't compress negative numbers");
879 if (sign == 2)
880 rb_bug("buffer size problem?");
881
882 cp = RSTRING_PTR(buf);
883 while (1 < numbytes) {
884 *cp |= 0x80;
885 cp++;
886 numbytes--;
887 }
888
889 rb_str_buf_cat(res, RSTRING_PTR(buf), RSTRING_LEN(buf));
890 }
891 break;
892
893 default: {
894 unknown_directive("pack", type, fmt);
895 break;
896 }
897 }
898 }
899
900 if (associates) {
901 str_associate(res, associates);
902 }
903 OBJ_INFECT(res, fmt);
904 switch (enc_info) {
905 case 1:
906 ENCODING_CODERANGE_SET(res, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
907 break;
908 case 2:
909 rb_enc_set_index(res, rb_utf8_encindex());
910 break;
911 default:
912 /* do nothing, keep ASCII-8BIT */
913 break;
914 }
915 return res;
916 }
917
918 static const char uu_table[] =
919 "`!\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_";
920 static const char b64_table[] =
921 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
922
923 static void
encodes(VALUE str,const char * s0,long len,int type,int tail_lf)924 encodes(VALUE str, const char *s0, long len, int type, int tail_lf)
925 {
926 enum {buff_size = 4096, encoded_unit = 4, input_unit = 3};
927 char buff[buff_size + 1]; /* +1 for tail_lf */
928 long i = 0;
929 const char *const trans = type == 'u' ? uu_table : b64_table;
930 char padding;
931 const unsigned char *s = (const unsigned char *)s0;
932
933 if (type == 'u') {
934 buff[i++] = (char)len + ' ';
935 padding = '`';
936 }
937 else {
938 padding = '=';
939 }
940 while (len >= input_unit) {
941 while (len >= input_unit && buff_size-i >= encoded_unit) {
942 buff[i++] = trans[077 & (*s >> 2)];
943 buff[i++] = trans[077 & (((*s << 4) & 060) | ((s[1] >> 4) & 017))];
944 buff[i++] = trans[077 & (((s[1] << 2) & 074) | ((s[2] >> 6) & 03))];
945 buff[i++] = trans[077 & s[2]];
946 s += input_unit;
947 len -= input_unit;
948 }
949 if (buff_size-i < encoded_unit) {
950 rb_str_buf_cat(str, buff, i);
951 i = 0;
952 }
953 }
954
955 if (len == 2) {
956 buff[i++] = trans[077 & (*s >> 2)];
957 buff[i++] = trans[077 & (((*s << 4) & 060) | ((s[1] >> 4) & 017))];
958 buff[i++] = trans[077 & (((s[1] << 2) & 074) | (('\0' >> 6) & 03))];
959 buff[i++] = padding;
960 }
961 else if (len == 1) {
962 buff[i++] = trans[077 & (*s >> 2)];
963 buff[i++] = trans[077 & (((*s << 4) & 060) | (('\0' >> 4) & 017))];
964 buff[i++] = padding;
965 buff[i++] = padding;
966 }
967 if (tail_lf) buff[i++] = '\n';
968 rb_str_buf_cat(str, buff, i);
969 if ((size_t)i > sizeof(buff)) rb_bug("encodes() buffer overrun");
970 }
971
972 static const char hex_table[] = "0123456789ABCDEF";
973
974 static void
qpencode(VALUE str,VALUE from,long len)975 qpencode(VALUE str, VALUE from, long len)
976 {
977 char buff[1024];
978 long i = 0, n = 0, prev = EOF;
979 unsigned char *s = (unsigned char*)RSTRING_PTR(from);
980 unsigned char *send = s + RSTRING_LEN(from);
981
982 while (s < send) {
983 if ((*s > 126) ||
984 (*s < 32 && *s != '\n' && *s != '\t') ||
985 (*s == '=')) {
986 buff[i++] = '=';
987 buff[i++] = hex_table[*s >> 4];
988 buff[i++] = hex_table[*s & 0x0f];
989 n += 3;
990 prev = EOF;
991 }
992 else if (*s == '\n') {
993 if (prev == ' ' || prev == '\t') {
994 buff[i++] = '=';
995 buff[i++] = *s;
996 }
997 buff[i++] = *s;
998 n = 0;
999 prev = *s;
1000 }
1001 else {
1002 buff[i++] = *s;
1003 n++;
1004 prev = *s;
1005 }
1006 if (n > len) {
1007 buff[i++] = '=';
1008 buff[i++] = '\n';
1009 n = 0;
1010 prev = '\n';
1011 }
1012 if (i > 1024 - 5) {
1013 rb_str_buf_cat(str, buff, i);
1014 i = 0;
1015 }
1016 s++;
1017 }
1018 if (n > 0) {
1019 buff[i++] = '=';
1020 buff[i++] = '\n';
1021 }
1022 if (i > 0) {
1023 rb_str_buf_cat(str, buff, i);
1024 }
1025 }
1026
1027 static inline int
hex2num(char c)1028 hex2num(char c)
1029 {
1030 int n;
1031 n = ruby_digit36_to_number_table[(unsigned char)c];
1032 if (16 <= n)
1033 n = -1;
1034 return n;
1035 }
1036
/*
 * Clamps len to the number of whole sz-byte items left between s and send.
 * When len had to be reduced and no '*' count was given (star is zero),
 * the number of items that could not be satisfied is left in tmp_len;
 * otherwise tmp_len is 0.
 * Expects len, tmp_len, star, s and send to be in scope at the use site.
 */
#define PACK_LENGTH_ADJUST_SIZE(sz) do {	\
    tmp_len = 0;				\
    if (len > (long)((send-s)/(sz))) {		\
        if (!star) {				\
            tmp_len = len-(send-s)/(sz);	\
        }					\
        len = (send-s)/(sz);			\
    }						\
} while (0)

/*
 * In UNPACK_ARRAY mode with a positive tmp_len, stores Qnil at the index
 * tmp_len slots past the current last element of ary (presumably so the
 * array gains tmp_len nil entries -- confirm against rb_ary_store).
 * Expects tmp_len, mode and ary to be in scope at the use site.
 */
#define PACK_ITEM_ADJUST() do { \
    if (tmp_len > 0 && mode == UNPACK_ARRAY) \
        rb_ary_store(ary, RARRAY_LEN(ary)+tmp_len-1, Qnil); \
} while (0)
1051
1052 /* Workaround for Oracle Developer Studio (Oracle Solaris Studio)
1053 * 12.4/12.5/12.6 C compiler optimization bug
1054 * with "-xO4" optimization option.
1055 */
1056 #if defined(__SUNPRO_C) && 0x5130 <= __SUNPRO_C && __SUNPRO_C <= 0x5150
1057 # define AVOID_CC_BUG volatile
1058 #else
1059 # define AVOID_CC_BUG
1060 #endif
1061
1062 static VALUE
infected_str_new(const char * ptr,long len,VALUE str)1063 infected_str_new(const char *ptr, long len, VALUE str)
1064 {
1065 VALUE s = rb_str_new(ptr, len);
1066
1067 OBJ_INFECT(s, str);
1068 return s;
1069 }
1070
1071 /* unpack mode */
1072 #define UNPACK_ARRAY 0
1073 #define UNPACK_BLOCK 1
1074 #define UNPACK_1 2
1075
1076 static VALUE
pack_unpack_internal(VALUE str,VALUE fmt,int mode)1077 pack_unpack_internal(VALUE str, VALUE fmt, int mode)
1078 {
1079 #define hexdigits ruby_hexdigits
1080 char *s, *send;
1081 char *p, *pend;
1082 VALUE ary;
1083 char type;
1084 long len;
1085 AVOID_CC_BUG long tmp_len;
1086 int star;
1087 #ifdef NATINT_PACK
1088 int natint; /* native integer */
1089 #endif
1090 int signed_p, integer_size, bigendian_p;
1091 #define UNPACK_PUSH(item) do {\
1092 VALUE item_val = (item);\
1093 if ((mode) == UNPACK_BLOCK) {\
1094 rb_yield(item_val);\
1095 }\
1096 else if ((mode) == UNPACK_ARRAY) {\
1097 rb_ary_push(ary, item_val);\
1098 }\
1099 else /* if ((mode) == UNPACK_1) { */ {\
1100 return item_val; \
1101 }\
1102 } while (0)
1103
1104 StringValue(str);
1105 StringValue(fmt);
1106 s = RSTRING_PTR(str);
1107 send = s + RSTRING_LEN(str);
1108 p = RSTRING_PTR(fmt);
1109 pend = p + RSTRING_LEN(fmt);
1110
1111 ary = mode == UNPACK_ARRAY ? rb_ary_new() : Qnil;
1112 while (p < pend) {
1113 int explicit_endian = 0;
1114 type = *p++;
1115 #ifdef NATINT_PACK
1116 natint = 0;
1117 #endif
1118
1119 if (ISSPACE(type)) continue;
1120 if (type == '#') {
1121 while ((p < pend) && (*p != '\n')) {
1122 p++;
1123 }
1124 continue;
1125 }
1126
1127 star = 0;
1128 {
1129 modifiers:
1130 switch (*p) {
1131 case '_':
1132 case '!':
1133
1134 if (strchr(natstr, type)) {
1135 #ifdef NATINT_PACK
1136 natint = 1;
1137 #endif
1138 p++;
1139 }
1140 else {
1141 rb_raise(rb_eArgError, "'%c' allowed only after types %s", *p, natstr);
1142 }
1143 goto modifiers;
1144
1145 case '<':
1146 case '>':
1147 if (!strchr(endstr, type)) {
1148 rb_raise(rb_eArgError, "'%c' allowed only after types %s", *p, endstr);
1149 }
1150 if (explicit_endian) {
1151 rb_raise(rb_eRangeError, "Can't use both '<' and '>'");
1152 }
1153 explicit_endian = *p++;
1154 goto modifiers;
1155 }
1156 }
1157
1158 if (p >= pend)
1159 len = 1;
1160 else if (*p == '*') {
1161 star = 1;
1162 len = send - s;
1163 p++;
1164 }
1165 else if (ISDIGIT(*p)) {
1166 errno = 0;
1167 len = STRTOUL(p, (char**)&p, 10);
1168 if (len < 0 || errno) {
1169 rb_raise(rb_eRangeError, "pack length too big");
1170 }
1171 }
1172 else {
1173 len = (type != '@');
1174 }
1175
1176 switch (type) {
1177 case '%':
1178 rb_raise(rb_eArgError, "%% is not supported");
1179 break;
1180
1181 case 'A':
1182 if (len > send - s) len = send - s;
1183 {
1184 long end = len;
1185 char *t = s + len - 1;
1186
1187 while (t >= s) {
1188 if (*t != ' ' && *t != '\0') break;
1189 t--; len--;
1190 }
1191 UNPACK_PUSH(infected_str_new(s, len, str));
1192 s += end;
1193 }
1194 break;
1195
1196 case 'Z':
1197 {
1198 char *t = s;
1199
1200 if (len > send-s) len = send-s;
1201 while (t < s+len && *t) t++;
1202 UNPACK_PUSH(infected_str_new(s, t-s, str));
1203 if (t < send) t++;
1204 s = star ? t : s+len;
1205 }
1206 break;
1207
1208 case 'a':
1209 if (len > send - s) len = send - s;
1210 UNPACK_PUSH(infected_str_new(s, len, str));
1211 s += len;
1212 break;
1213
1214 case 'b':
1215 {
1216 VALUE bitstr;
1217 char *t;
1218 int bits;
1219 long i;
1220
1221 if (p[-1] == '*' || len > (send - s) * 8)
1222 len = (send - s) * 8;
1223 bits = 0;
1224 bitstr = rb_usascii_str_new(0, len);
1225 OBJ_INFECT(bitstr, str);
1226 t = RSTRING_PTR(bitstr);
1227 for (i=0; i<len; i++) {
1228 if (i & 7) bits >>= 1;
1229 else bits = (unsigned char)*s++;
1230 *t++ = (bits & 1) ? '1' : '0';
1231 }
1232 UNPACK_PUSH(bitstr);
1233 }
1234 break;
1235
1236 case 'B':
1237 {
1238 VALUE bitstr;
1239 char *t;
1240 int bits;
1241 long i;
1242
1243 if (p[-1] == '*' || len > (send - s) * 8)
1244 len = (send - s) * 8;
1245 bits = 0;
1246 bitstr = rb_usascii_str_new(0, len);
1247 OBJ_INFECT(bitstr, str);
1248 t = RSTRING_PTR(bitstr);
1249 for (i=0; i<len; i++) {
1250 if (i & 7) bits <<= 1;
1251 else bits = (unsigned char)*s++;
1252 *t++ = (bits & 128) ? '1' : '0';
1253 }
1254 UNPACK_PUSH(bitstr);
1255 }
1256 break;
1257
1258 case 'h':
1259 {
1260 VALUE bitstr;
1261 char *t;
1262 int bits;
1263 long i;
1264
1265 if (p[-1] == '*' || len > (send - s) * 2)
1266 len = (send - s) * 2;
1267 bits = 0;
1268 bitstr = rb_usascii_str_new(0, len);
1269 OBJ_INFECT(bitstr, str);
1270 t = RSTRING_PTR(bitstr);
1271 for (i=0; i<len; i++) {
1272 if (i & 1)
1273 bits >>= 4;
1274 else
1275 bits = (unsigned char)*s++;
1276 *t++ = hexdigits[bits & 15];
1277 }
1278 UNPACK_PUSH(bitstr);
1279 }
1280 break;
1281
1282 case 'H':
1283 {
1284 VALUE bitstr;
1285 char *t;
1286 int bits;
1287 long i;
1288
1289 if (p[-1] == '*' || len > (send - s) * 2)
1290 len = (send - s) * 2;
1291 bits = 0;
1292 bitstr = rb_usascii_str_new(0, len);
1293 OBJ_INFECT(bitstr, str);
1294 t = RSTRING_PTR(bitstr);
1295 for (i=0; i<len; i++) {
1296 if (i & 1)
1297 bits <<= 4;
1298 else
1299 bits = (unsigned char)*s++;
1300 *t++ = hexdigits[(bits >> 4) & 15];
1301 }
1302 UNPACK_PUSH(bitstr);
1303 }
1304 break;
1305
1306 case 'c':
1307 signed_p = 1;
1308 integer_size = 1;
1309 bigendian_p = BIGENDIAN_P(); /* not effective */
1310 goto unpack_integer;
1311
1312 case 'C':
1313 signed_p = 0;
1314 integer_size = 1;
1315 bigendian_p = BIGENDIAN_P(); /* not effective */
1316 goto unpack_integer;
1317
1318 case 's':
1319 signed_p = 1;
1320 integer_size = NATINT_LEN(short, 2);
1321 bigendian_p = BIGENDIAN_P();
1322 goto unpack_integer;
1323
1324 case 'S':
1325 signed_p = 0;
1326 integer_size = NATINT_LEN(short, 2);
1327 bigendian_p = BIGENDIAN_P();
1328 goto unpack_integer;
1329
1330 case 'i':
1331 signed_p = 1;
1332 integer_size = (int)sizeof(int);
1333 bigendian_p = BIGENDIAN_P();
1334 goto unpack_integer;
1335
1336 case 'I':
1337 signed_p = 0;
1338 integer_size = (int)sizeof(int);
1339 bigendian_p = BIGENDIAN_P();
1340 goto unpack_integer;
1341
1342 case 'l':
1343 signed_p = 1;
1344 integer_size = NATINT_LEN(long, 4);
1345 bigendian_p = BIGENDIAN_P();
1346 goto unpack_integer;
1347
1348 case 'L':
1349 signed_p = 0;
1350 integer_size = NATINT_LEN(long, 4);
1351 bigendian_p = BIGENDIAN_P();
1352 goto unpack_integer;
1353
1354 case 'q':
1355 signed_p = 1;
1356 integer_size = NATINT_LEN_Q;
1357 bigendian_p = BIGENDIAN_P();
1358 goto unpack_integer;
1359
1360 case 'Q':
1361 signed_p = 0;
1362 integer_size = NATINT_LEN_Q;
1363 bigendian_p = BIGENDIAN_P();
1364 goto unpack_integer;
1365
1366 case 'j':
1367 signed_p = 1;
1368 integer_size = sizeof(intptr_t);
1369 bigendian_p = BIGENDIAN_P();
1370 goto unpack_integer;
1371
1372 case 'J':
1373 signed_p = 0;
1374 integer_size = sizeof(uintptr_t);
1375 bigendian_p = BIGENDIAN_P();
1376 goto unpack_integer;
1377
1378 case 'n':
1379 signed_p = 0;
1380 integer_size = 2;
1381 bigendian_p = 1;
1382 goto unpack_integer;
1383
1384 case 'N':
1385 signed_p = 0;
1386 integer_size = 4;
1387 bigendian_p = 1;
1388 goto unpack_integer;
1389
1390 case 'v':
1391 signed_p = 0;
1392 integer_size = 2;
1393 bigendian_p = 0;
1394 goto unpack_integer;
1395
1396 case 'V':
1397 signed_p = 0;
1398 integer_size = 4;
1399 bigendian_p = 0;
1400 goto unpack_integer;
1401
1402 unpack_integer:
1403 if (explicit_endian) {
1404 bigendian_p = explicit_endian == '>';
1405 }
1406 PACK_LENGTH_ADJUST_SIZE(integer_size);
1407 while (len-- > 0) {
1408 int flags = bigendian_p ? INTEGER_PACK_BIG_ENDIAN : INTEGER_PACK_LITTLE_ENDIAN;
1409 VALUE val;
1410 if (signed_p)
1411 flags |= INTEGER_PACK_2COMP;
1412 val = rb_integer_unpack(s, integer_size, 1, 0, flags);
1413 UNPACK_PUSH(val);
1414 s += integer_size;
1415 }
1416 PACK_ITEM_ADJUST();
1417 break;
1418
1419 case 'f':
1420 case 'F':
1421 PACK_LENGTH_ADJUST_SIZE(sizeof(float));
1422 while (len-- > 0) {
1423 float tmp;
1424 memcpy(&tmp, s, sizeof(float));
1425 s += sizeof(float);
1426 UNPACK_PUSH(DBL2NUM((double)tmp));
1427 }
1428 PACK_ITEM_ADJUST();
1429 break;
1430
1431 case 'e':
1432 PACK_LENGTH_ADJUST_SIZE(sizeof(float));
1433 while (len-- > 0) {
1434 FLOAT_CONVWITH(tmp);
1435 memcpy(tmp.buf, s, sizeof(float));
1436 s += sizeof(float);
1437 VTOHF(tmp);
1438 UNPACK_PUSH(DBL2NUM(tmp.f));
1439 }
1440 PACK_ITEM_ADJUST();
1441 break;
1442
1443 case 'E':
1444 PACK_LENGTH_ADJUST_SIZE(sizeof(double));
1445 while (len-- > 0) {
1446 DOUBLE_CONVWITH(tmp);
1447 memcpy(tmp.buf, s, sizeof(double));
1448 s += sizeof(double);
1449 VTOHD(tmp);
1450 UNPACK_PUSH(DBL2NUM(tmp.d));
1451 }
1452 PACK_ITEM_ADJUST();
1453 break;
1454
1455 case 'D':
1456 case 'd':
1457 PACK_LENGTH_ADJUST_SIZE(sizeof(double));
1458 while (len-- > 0) {
1459 double tmp;
1460 memcpy(&tmp, s, sizeof(double));
1461 s += sizeof(double);
1462 UNPACK_PUSH(DBL2NUM(tmp));
1463 }
1464 PACK_ITEM_ADJUST();
1465 break;
1466
1467 case 'g':
1468 PACK_LENGTH_ADJUST_SIZE(sizeof(float));
1469 while (len-- > 0) {
1470 FLOAT_CONVWITH(tmp);
1471 memcpy(tmp.buf, s, sizeof(float));
1472 s += sizeof(float);
1473 NTOHF(tmp);
1474 UNPACK_PUSH(DBL2NUM(tmp.f));
1475 }
1476 PACK_ITEM_ADJUST();
1477 break;
1478
1479 case 'G':
1480 PACK_LENGTH_ADJUST_SIZE(sizeof(double));
1481 while (len-- > 0) {
1482 DOUBLE_CONVWITH(tmp);
1483 memcpy(tmp.buf, s, sizeof(double));
1484 s += sizeof(double);
1485 NTOHD(tmp);
1486 UNPACK_PUSH(DBL2NUM(tmp.d));
1487 }
1488 PACK_ITEM_ADJUST();
1489 break;
1490
1491 case 'U':
1492 if (len > send - s) len = send - s;
1493 while (len > 0 && s < send) {
1494 long alen = send - s;
1495 unsigned long l;
1496
1497 l = utf8_to_uv(s, &alen);
1498 s += alen; len--;
1499 UNPACK_PUSH(ULONG2NUM(l));
1500 }
1501 break;
1502
1503 case 'u':
1504 {
1505 VALUE buf = infected_str_new(0, (send - s)*3/4, str);
1506 char *ptr = RSTRING_PTR(buf);
1507 long total = 0;
1508
1509 while (s < send && (unsigned char)*s > ' ' && (unsigned char)*s < 'a') {
1510 long a,b,c,d;
1511 char hunk[3];
1512
1513 len = ((unsigned char)*s++ - ' ') & 077;
1514
1515 total += len;
1516 if (total > RSTRING_LEN(buf)) {
1517 len -= total - RSTRING_LEN(buf);
1518 total = RSTRING_LEN(buf);
1519 }
1520
1521 while (len > 0) {
1522 long mlen = len > 3 ? 3 : len;
1523
1524 if (s < send && (unsigned char)*s >= ' ' && (unsigned char)*s < 'a')
1525 a = ((unsigned char)*s++ - ' ') & 077;
1526 else
1527 a = 0;
1528 if (s < send && (unsigned char)*s >= ' ' && (unsigned char)*s < 'a')
1529 b = ((unsigned char)*s++ - ' ') & 077;
1530 else
1531 b = 0;
1532 if (s < send && (unsigned char)*s >= ' ' && (unsigned char)*s < 'a')
1533 c = ((unsigned char)*s++ - ' ') & 077;
1534 else
1535 c = 0;
1536 if (s < send && (unsigned char)*s >= ' ' && (unsigned char)*s < 'a')
1537 d = ((unsigned char)*s++ - ' ') & 077;
1538 else
1539 d = 0;
1540 hunk[0] = (char)(a << 2 | b >> 4);
1541 hunk[1] = (char)(b << 4 | c >> 2);
1542 hunk[2] = (char)(c << 6 | d);
1543 memcpy(ptr, hunk, mlen);
1544 ptr += mlen;
1545 len -= mlen;
1546 }
1547 if (s < send && (unsigned char)*s != '\r' && *s != '\n')
1548 s++; /* possible checksum byte */
1549 if (s < send && *s == '\r') s++;
1550 if (s < send && *s == '\n') s++;
1551 }
1552
1553 rb_str_set_len(buf, total);
1554 UNPACK_PUSH(buf);
1555 }
1556 break;
1557
1558 case 'm':
1559 {
1560 VALUE buf = infected_str_new(0, (send - s + 3)*3/4, str); /* +3 is for skipping paddings */
1561 char *ptr = RSTRING_PTR(buf);
1562 int a = -1,b = -1,c = 0,d = 0;
1563 static signed char b64_xtable[256];
1564
1565 if (b64_xtable['/'] <= 0) {
1566 int i;
1567
1568 for (i = 0; i < 256; i++) {
1569 b64_xtable[i] = -1;
1570 }
1571 for (i = 0; i < 64; i++) {
1572 b64_xtable[(unsigned char)b64_table[i]] = (char)i;
1573 }
1574 }
1575 if (len == 0) {
1576 while (s < send) {
1577 a = b = c = d = -1;
1578 a = b64_xtable[(unsigned char)*s++];
1579 if (s >= send || a == -1) rb_raise(rb_eArgError, "invalid base64");
1580 b = b64_xtable[(unsigned char)*s++];
1581 if (s >= send || b == -1) rb_raise(rb_eArgError, "invalid base64");
1582 if (*s == '=') {
1583 if (s + 2 == send && *(s + 1) == '=') break;
1584 rb_raise(rb_eArgError, "invalid base64");
1585 }
1586 c = b64_xtable[(unsigned char)*s++];
1587 if (s >= send || c == -1) rb_raise(rb_eArgError, "invalid base64");
1588 if (s + 1 == send && *s == '=') break;
1589 d = b64_xtable[(unsigned char)*s++];
1590 if (d == -1) rb_raise(rb_eArgError, "invalid base64");
1591 *ptr++ = castchar(a << 2 | b >> 4);
1592 *ptr++ = castchar(b << 4 | c >> 2);
1593 *ptr++ = castchar(c << 6 | d);
1594 }
1595 if (c == -1) {
1596 *ptr++ = castchar(a << 2 | b >> 4);
1597 if (b & 0xf) rb_raise(rb_eArgError, "invalid base64");
1598 }
1599 else if (d == -1) {
1600 *ptr++ = castchar(a << 2 | b >> 4);
1601 *ptr++ = castchar(b << 4 | c >> 2);
1602 if (c & 0x3) rb_raise(rb_eArgError, "invalid base64");
1603 }
1604 }
1605 else {
1606 while (s < send) {
1607 a = b = c = d = -1;
1608 while ((a = b64_xtable[(unsigned char)*s]) == -1 && s < send) {s++;}
1609 if (s >= send) break;
1610 s++;
1611 while ((b = b64_xtable[(unsigned char)*s]) == -1 && s < send) {s++;}
1612 if (s >= send) break;
1613 s++;
1614 while ((c = b64_xtable[(unsigned char)*s]) == -1 && s < send) {if (*s == '=') break; s++;}
1615 if (*s == '=' || s >= send) break;
1616 s++;
1617 while ((d = b64_xtable[(unsigned char)*s]) == -1 && s < send) {if (*s == '=') break; s++;}
1618 if (*s == '=' || s >= send) break;
1619 s++;
1620 *ptr++ = castchar(a << 2 | b >> 4);
1621 *ptr++ = castchar(b << 4 | c >> 2);
1622 *ptr++ = castchar(c << 6 | d);
1623 a = -1;
1624 }
1625 if (a != -1 && b != -1) {
1626 if (c == -1)
1627 *ptr++ = castchar(a << 2 | b >> 4);
1628 else {
1629 *ptr++ = castchar(a << 2 | b >> 4);
1630 *ptr++ = castchar(b << 4 | c >> 2);
1631 }
1632 }
1633 }
1634 rb_str_set_len(buf, ptr - RSTRING_PTR(buf));
1635 UNPACK_PUSH(buf);
1636 }
1637 break;
1638
1639 case 'M':
1640 {
1641 VALUE buf = infected_str_new(0, send - s, str);
1642 char *ptr = RSTRING_PTR(buf), *ss = s;
1643 int csum = 0;
1644 int c1, c2;
1645
1646 while (s < send) {
1647 if (*s == '=') {
1648 if (++s == send) break;
1649 if (s+1 < send && *s == '\r' && *(s+1) == '\n')
1650 s++;
1651 if (*s != '\n') {
1652 if ((c1 = hex2num(*s)) == -1) break;
1653 if (++s == send) break;
1654 if ((c2 = hex2num(*s)) == -1) break;
1655 csum |= *ptr++ = castchar(c1 << 4 | c2);
1656 }
1657 }
1658 else {
1659 csum |= *ptr++ = *s;
1660 }
1661 s++;
1662 ss = s;
1663 }
1664 rb_str_set_len(buf, ptr - RSTRING_PTR(buf));
1665 rb_str_buf_cat(buf, ss, send-ss);
1666 csum = ISASCII(csum) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
1667 ENCODING_CODERANGE_SET(buf, rb_ascii8bit_encindex(), csum);
1668 UNPACK_PUSH(buf);
1669 }
1670 break;
1671
1672 case '@':
1673 if (len > RSTRING_LEN(str))
1674 rb_raise(rb_eArgError, "@ outside of string");
1675 s = RSTRING_PTR(str) + len;
1676 break;
1677
1678 case 'X':
1679 if (len > s - RSTRING_PTR(str))
1680 rb_raise(rb_eArgError, "X outside of string");
1681 s -= len;
1682 break;
1683
1684 case 'x':
1685 if (len > send - s)
1686 rb_raise(rb_eArgError, "x outside of string");
1687 s += len;
1688 break;
1689
1690 case 'P':
1691 if (sizeof(char *) <= (size_t)(send - s)) {
1692 VALUE tmp = Qnil;
1693 char *t;
1694
1695 memcpy(&t, s, sizeof(char *));
1696 s += sizeof(char *);
1697
1698 if (t) {
1699 VALUE a;
1700 const VALUE *p, *pend;
1701
1702 if (!(a = str_associated(str))) {
1703 rb_raise(rb_eArgError, "no associated pointer");
1704 }
1705 p = RARRAY_CONST_PTR(a);
1706 pend = p + RARRAY_LEN(a);
1707 while (p < pend) {
1708 if (RB_TYPE_P(*p, T_STRING) && RSTRING_PTR(*p) == t) {
1709 if (len < RSTRING_LEN(*p)) {
1710 tmp = rb_tainted_str_new(t, len);
1711 str_associate(tmp, a);
1712 }
1713 else {
1714 tmp = *p;
1715 }
1716 break;
1717 }
1718 p++;
1719 }
1720 if (p == pend) {
1721 rb_raise(rb_eArgError, "non associated pointer");
1722 }
1723 }
1724 UNPACK_PUSH(tmp);
1725 }
1726 break;
1727
1728 case 'p':
1729 if (len > (long)((send - s) / sizeof(char *)))
1730 len = (send - s) / sizeof(char *);
1731 while (len-- > 0) {
1732 if ((size_t)(send - s) < sizeof(char *))
1733 break;
1734 else {
1735 VALUE tmp = Qnil;
1736 char *t;
1737
1738 memcpy(&t, s, sizeof(char *));
1739 s += sizeof(char *);
1740
1741 if (t) {
1742 VALUE a;
1743 const VALUE *p, *pend;
1744
1745 if (!(a = str_associated(str))) {
1746 rb_raise(rb_eArgError, "no associated pointer");
1747 }
1748 p = RARRAY_CONST_PTR(a);
1749 pend = p + RARRAY_LEN(a);
1750 while (p < pend) {
1751 if (RB_TYPE_P(*p, T_STRING) && RSTRING_PTR(*p) == t) {
1752 tmp = *p;
1753 break;
1754 }
1755 p++;
1756 }
1757 if (p == pend) {
1758 rb_raise(rb_eArgError, "non associated pointer");
1759 }
1760 }
1761 UNPACK_PUSH(tmp);
1762 }
1763 }
1764 break;
1765
1766 case 'w':
1767 {
1768 char *s0 = s;
1769 while (len > 0 && s < send) {
1770 if (*s & 0x80) {
1771 s++;
1772 }
1773 else {
1774 s++;
1775 UNPACK_PUSH(rb_integer_unpack(s0, s-s0, 1, 1, INTEGER_PACK_BIG_ENDIAN));
1776 len--;
1777 s0 = s;
1778 }
1779 }
1780 }
1781 break;
1782
1783 default:
1784 unknown_directive("unpack", type, fmt);
1785 break;
1786 }
1787 }
1788
1789 return ary;
1790 }
1791
1792 /*
1793 * call-seq:
1794 * str.unpack(format) -> anArray
1795 *
1796 * Decodes <i>str</i> (which may contain binary data) according to the
1797 * format string, returning an array of each value extracted. The
1798 * format string consists of a sequence of single-character directives,
1799 * summarized in the table at the end of this entry.
1800 * Each directive may be followed
1801 * by a number, indicating the number of times to repeat with this
1802 * directive. An asterisk (``<code>*</code>'') will use up all
 *  remaining elements. The directives <code>sSiIlLqQjJ</code> may each be
1804 * followed by an underscore (``<code>_</code>'') or
1805 * exclamation mark (``<code>!</code>'') to use the underlying
1806 * platform's native size for the specified type; otherwise, it uses a
1807 * platform-independent consistent size. Spaces are ignored in the
1808 * format string. See also <code>String#unpack1</code>, <code>Array#pack</code>.
1809 *
1810 * "abc \0\0abc \0\0".unpack('A6Z6') #=> ["abc", "abc "]
1811 * "abc \0\0".unpack('a3a3') #=> ["abc", " \000\000"]
1812 * "abc \0abc \0".unpack('Z*Z*') #=> ["abc ", "abc "]
1813 * "aa".unpack('b8B8') #=> ["10000110", "01100001"]
1814 * "aaa".unpack('h2H2c') #=> ["16", "61", 97]
1815 * "\xfe\xff\xfe\xff".unpack('sS') #=> [-2, 65534]
1816 * "now=20is".unpack('M*') #=> ["now is"]
1817 * "whole".unpack('xax2aX2aX1aX2a') #=> ["h", "e", "l", "l", "o"]
1818 *
1819 * This table summarizes the various formats and the Ruby classes
1820 * returned by each.
1821 *
1822 * Integer | |
1823 * Directive | Returns | Meaning
1824 * ------------------------------------------------------------------
1825 * C | Integer | 8-bit unsigned (unsigned char)
1826 * S | Integer | 16-bit unsigned, native endian (uint16_t)
1827 * L | Integer | 32-bit unsigned, native endian (uint32_t)
1828 * Q | Integer | 64-bit unsigned, native endian (uint64_t)
1829 * J | Integer | pointer width unsigned, native endian (uintptr_t)
1830 * | |
1831 * c | Integer | 8-bit signed (signed char)
1832 * s | Integer | 16-bit signed, native endian (int16_t)
1833 * l | Integer | 32-bit signed, native endian (int32_t)
1834 * q | Integer | 64-bit signed, native endian (int64_t)
1835 * j | Integer | pointer width signed, native endian (intptr_t)
1836 * | |
1837 * S_ S! | Integer | unsigned short, native endian
1838 * I I_ I! | Integer | unsigned int, native endian
1839 * L_ L! | Integer | unsigned long, native endian
1840 * Q_ Q! | Integer | unsigned long long, native endian (ArgumentError
1841 * | | if the platform has no long long type.)
1842 * J! | Integer | uintptr_t, native endian (same with J)
1843 * | |
1844 * s_ s! | Integer | signed short, native endian
1845 * i i_ i! | Integer | signed int, native endian
1846 * l_ l! | Integer | signed long, native endian
1847 * q_ q! | Integer | signed long long, native endian (ArgumentError
1848 * | | if the platform has no long long type.)
1849 * j! | Integer | intptr_t, native endian (same with j)
1850 * | |
1851 * S> s> S!> s!> | Integer | same as the directives without ">" except
1852 * L> l> L!> l!> | | big endian
1853 * I!> i!> | |
1854 * Q> q> Q!> q!> | | "S>" is same as "n"
1855 * J> j> J!> j!> | | "L>" is same as "N"
1856 * | |
1857 * S< s< S!< s!< | Integer | same as the directives without "<" except
1858 * L< l< L!< l!< | | little endian
1859 * I!< i!< | |
1860 * Q< q< Q!< q!< | | "S<" is same as "v"
1861 * J< j< J!< j!< | | "L<" is same as "V"
1862 * | |
1863 * n | Integer | 16-bit unsigned, network (big-endian) byte order
1864 * N | Integer | 32-bit unsigned, network (big-endian) byte order
1865 * v | Integer | 16-bit unsigned, VAX (little-endian) byte order
1866 * V | Integer | 32-bit unsigned, VAX (little-endian) byte order
1867 * | |
1868 * U | Integer | UTF-8 character
1869 * w | Integer | BER-compressed integer (see Array.pack)
1870 *
1871 * Float | |
1872 * Directive | Returns | Meaning
1873 * -----------------------------------------------------------------
1874 * D d | Float | double-precision, native format
1875 * F f | Float | single-precision, native format
1876 * E | Float | double-precision, little-endian byte order
1877 * e | Float | single-precision, little-endian byte order
1878 * G | Float | double-precision, network (big-endian) byte order
1879 * g | Float | single-precision, network (big-endian) byte order
1880 *
1881 * String | |
1882 * Directive | Returns | Meaning
1883 * -----------------------------------------------------------------
1884 * A | String | arbitrary binary string (remove trailing nulls and ASCII spaces)
1885 * a | String | arbitrary binary string
1886 * Z | String | null-terminated string
1887 * B | String | bit string (MSB first)
1888 * b | String | bit string (LSB first)
1889 * H | String | hex string (high nibble first)
1890 * h | String | hex string (low nibble first)
1891 * u | String | UU-encoded string
1892 * M | String | quoted-printable, MIME encoding (see RFC2045)
1893 * m | String | base64 encoded string (RFC 2045) (default)
1894 * | | base64 encoded string (RFC 4648) if followed by 0
1895 * P | String | pointer to a structure (fixed-length string)
1896 * p | String | pointer to a null-terminated string
1897 *
1898 * Misc. | |
1899 * Directive | Returns | Meaning
1900 * -----------------------------------------------------------------
1901 * @ | --- | skip to the offset given by the length argument
1902 * X | --- | skip backward one byte
1903 * x | --- | skip forward one byte
1904 *
1905 * HISTORY
1906 *
 *  * J, J!, j, and j! are available since Ruby 2.3.
1908 * * Q_, Q!, q_, and q! are available since Ruby 2.1.
1909 * * I!<, i!<, I!>, and i!> are available since Ruby 1.9.3.
1910 */
1911
1912 static VALUE
pack_unpack(VALUE str,VALUE fmt)1913 pack_unpack(VALUE str, VALUE fmt)
1914 {
1915 int mode = rb_block_given_p() ? UNPACK_BLOCK : UNPACK_ARRAY;
1916 return pack_unpack_internal(str, fmt, mode);
1917 }
1918
1919 /*
1920 * call-seq:
1921 * str.unpack1(format) -> obj
1922 *
1923 * Decodes <i>str</i> (which may contain binary data) according to the
1924 * format string, returning the first value extracted.
1925 * See also <code>String#unpack</code>, <code>Array#pack</code>.
1926 */
1927
1928 static VALUE
pack_unpack1(VALUE str,VALUE fmt)1929 pack_unpack1(VALUE str, VALUE fmt)
1930 {
1931 return pack_unpack_internal(str, fmt, UNPACK_1);
1932 }
1933
1934 int
rb_uv_to_utf8(char buf[6],unsigned long uv)1935 rb_uv_to_utf8(char buf[6], unsigned long uv)
1936 {
1937 if (uv <= 0x7f) {
1938 buf[0] = (char)uv;
1939 return 1;
1940 }
1941 if (uv <= 0x7ff) {
1942 buf[0] = castchar(((uv>>6)&0xff)|0xc0);
1943 buf[1] = castchar((uv&0x3f)|0x80);
1944 return 2;
1945 }
1946 if (uv <= 0xffff) {
1947 buf[0] = castchar(((uv>>12)&0xff)|0xe0);
1948 buf[1] = castchar(((uv>>6)&0x3f)|0x80);
1949 buf[2] = castchar((uv&0x3f)|0x80);
1950 return 3;
1951 }
1952 if (uv <= 0x1fffff) {
1953 buf[0] = castchar(((uv>>18)&0xff)|0xf0);
1954 buf[1] = castchar(((uv>>12)&0x3f)|0x80);
1955 buf[2] = castchar(((uv>>6)&0x3f)|0x80);
1956 buf[3] = castchar((uv&0x3f)|0x80);
1957 return 4;
1958 }
1959 if (uv <= 0x3ffffff) {
1960 buf[0] = castchar(((uv>>24)&0xff)|0xf8);
1961 buf[1] = castchar(((uv>>18)&0x3f)|0x80);
1962 buf[2] = castchar(((uv>>12)&0x3f)|0x80);
1963 buf[3] = castchar(((uv>>6)&0x3f)|0x80);
1964 buf[4] = castchar((uv&0x3f)|0x80);
1965 return 5;
1966 }
1967 if (uv <= 0x7fffffff) {
1968 buf[0] = castchar(((uv>>30)&0xff)|0xfc);
1969 buf[1] = castchar(((uv>>24)&0x3f)|0x80);
1970 buf[2] = castchar(((uv>>18)&0x3f)|0x80);
1971 buf[3] = castchar(((uv>>12)&0x3f)|0x80);
1972 buf[4] = castchar(((uv>>6)&0x3f)|0x80);
1973 buf[5] = castchar((uv&0x3f)|0x80);
1974 return 6;
1975 }
1976 rb_raise(rb_eRangeError, "pack(U): value out of range");
1977
1978 UNREACHABLE_RETURN(Qnil);
1979 }
1980
/*
 * utf8_limits[k] is the smallest code point that needs a (k + 1)-byte
 * sequence, i.e. one more than the largest value a k-byte sequence can
 * carry.  utf8_to_uv() compares a decoded value against the entry for
 * the length it was actually encoded with and rejects smaller values as
 * a "redundant UTF-8 sequence".
 */
static const unsigned long utf8_limits[] = {
    0UL,           /* 1 byte                              */
    1UL << 7,      /* 2 bytes: beyond  7 payload bits     */
    1UL << 11,     /* 3 bytes: beyond 11 payload bits     */
    1UL << 16,     /* 4 bytes: beyond 16 payload bits     */
    1UL << 21,     /* 5 bytes: beyond 21 payload bits     */
    1UL << 26,     /* 6 bytes: beyond 26 payload bits     */
    1UL << 31,     /* 7: first value past the 6-byte range */
};
1990
1991 static unsigned long
utf8_to_uv(const char * p,long * lenp)1992 utf8_to_uv(const char *p, long *lenp)
1993 {
1994 int c = *p++ & 0xff;
1995 unsigned long uv = c;
1996 long n;
1997
1998 if (!(uv & 0x80)) {
1999 *lenp = 1;
2000 return uv;
2001 }
2002 if (!(uv & 0x40)) {
2003 *lenp = 1;
2004 rb_raise(rb_eArgError, "malformed UTF-8 character");
2005 }
2006
2007 if (!(uv & 0x20)) { n = 2; uv &= 0x1f; }
2008 else if (!(uv & 0x10)) { n = 3; uv &= 0x0f; }
2009 else if (!(uv & 0x08)) { n = 4; uv &= 0x07; }
2010 else if (!(uv & 0x04)) { n = 5; uv &= 0x03; }
2011 else if (!(uv & 0x02)) { n = 6; uv &= 0x01; }
2012 else {
2013 *lenp = 1;
2014 rb_raise(rb_eArgError, "malformed UTF-8 character");
2015 }
2016 if (n > *lenp) {
2017 rb_raise(rb_eArgError, "malformed UTF-8 character (expected %ld bytes, given %ld bytes)",
2018 n, *lenp);
2019 }
2020 *lenp = n--;
2021 if (n != 0) {
2022 while (n--) {
2023 c = *p++ & 0xff;
2024 if ((c & 0xc0) != 0x80) {
2025 *lenp -= n + 1;
2026 rb_raise(rb_eArgError, "malformed UTF-8 character");
2027 }
2028 else {
2029 c &= 0x3f;
2030 uv = uv << 6 | c;
2031 }
2032 }
2033 }
2034 n = *lenp - 1;
2035 if (uv < utf8_limits[n]) {
2036 rb_raise(rb_eArgError, "redundant UTF-8 sequence");
2037 }
2038 return uv;
2039 }
2040
2041 void
Init_pack(void)2042 Init_pack(void)
2043 {
2044 rb_define_method(rb_cArray, "pack", pack_pack, -1);
2045 rb_define_method(rb_cString, "unpack", pack_unpack, 1);
2046 rb_define_method(rb_cString, "unpack1", pack_unpack1, 1);
2047
2048 id_associated = rb_make_internal_id();
2049 }
2050