1 /*
2 * $Id: Map.xs,v 1.28 1998/03/23 23:57:46 schwartz Exp $
3 *
4 * ALPHA version
5 *
6 * Unicode::Map - C extensions
7 *
8 * Interface documentation at Map.pm
9 *
10 * Copyright (C) 1998, 1999, 2000 Martin Schwartz. All rights reserved.
11 * This program is free software; you can redistribute it and/or
12 * modify it under the same terms as Perl itself.
13 *
14 * Contact: Martin Schwartz <martin@nacho.de>
15 */
16
17 #ifdef __cplusplus
18 extern "C" {
19 #endif
20 #include "EXTERN.h"
21 #include "perl.h"
22 #include "XSUB.h"
23 #ifdef __cplusplus
24 }
25 #endif
26
27 /*
28 * It seems that dowarn isn't defined on some systems, PL_dowarn not on
29 * others. Gisle Aas deals with it this way:
30 */
31 #include "patchlevel.h"
32 #if PATCHLEVEL <= 4 && !defined(PL_dowarn)
33 #define PL_dowarn dowarn
34 #endif
35
36 /*
37 *
38 * "Map.h"
39 *
40 */
41
42 #define M_MAGIC 0xb827 /* magic word */
43 #define MAP8_BINFILE_MAGIC_HI 0xfffe /* magic word for Gisle's file format */
44 #define MAP8_BINFILE_MAGIC_LO 0x0001 /* */
45
46 #define M_END 0 /* end */
47 #define M_INF 1 /* infinite subsequent entries (default) */
48 #define M_BYTE 2 /* 1..255 subsequent entries */
49 #define M_VER 4 /* (Internal) file format revision. */
50 #define M_AKV 6 /* key1, val1, key2, val2, ... (default) */
51 #define M_AKAV 7 /* key1, key2, ..., val1, val2, ... */
52 #define M_PKV 8 /* partial key value mappings */
53 #define M_CKn 10 /* compress keys not */
54 #define M_CK 11 /* compress keys (default) */
55 #define M_CVn 13 /* compress values not */
56 #define M_CV 14 /* compress values (default) */
57
58 #define I_NAME 20 /* Info: (wstring) Character Set Name */
59 #define I_ALIAS 21 /* Info: (wstring) Charset alias (several entries ok) */
60 #define I_VER 22 /* Info: (wstring) Mapfile revision */
61 #define I_AUTH 23 /* Info: (wstring) Mapfile authRess */
62 #define I_INFO 24 /* Info: (wstring) Some userEss definable string */
63
64 #define T_BAD 0 /* Type: unknown */
65 #define T_MAP8 1 /* Type: Map8 style */
66 #define T_MAP 2 /* Type: Map style */
67
68 #define num1_DEFAULT M_INF;
69 #define method1_DEFAULT M_AKV;
70 #define keys1_DEFAULT M_CK;
71 #define values1_DEFAULT M_CV;
72
73 /* No function prototypes (as very old C-Compilers don't like them) */
74
75 /*
76 *
77 * "Map.c"
78 *
79 */
80
_byte(char ** buf)81 U8 _byte(char** buf) {
82 U8* tmp = (U8*) *buf; *buf+=1; return tmp[0];
83 }
_word(char ** buf)84 U16 _word(char** buf) {
85 U16 tmp; memcpy ((char*) &tmp, *buf, 2); *buf+=2; return ntohs(tmp);
86 }
_long(char ** buf)87 U32 _long(char** buf) {
88 U32 tmp; memcpy ((char*) &tmp, *buf, 4); *buf+=4; return ntohl(tmp);
89 }
90
__system_test(void)91 AV* __system_test (void) {
92 /*
93 * If this test suit gets passed ok, the C methods will probably work.
94 */
95 char* check = "\x01\x04\xfe\x83\x73\xf8\x04\x59\x19";
96 char* buf;
97 AV* list = newAV();
98 U32 i, k;
99
100 /*
101 * Have the Unn the bytesize I assume?
102 */
103 if (sizeof(U8)!=1) { av_push (list, newSVpv("1a", 1)); }
104 if (sizeof(U16)!=2) { av_push (list, newSVpv("1b", 1)); }
105 if (sizeof(U32)!=4) { av_push (list, newSVpv("1c", 1)); }
106
107 /*
108 * Does _byte work?
109 */
110 buf = check;
111 if (_byte(&buf) != 0x01) { av_push(list, newSVpv("2a", 2)); }
112 if (_byte(&buf) != 0x04) { av_push(list, newSVpv("2b", 2)); }
113 if (_byte(&buf) != 0xfe) { av_push(list, newSVpv("2c", 2)); }
114 if (_byte(&buf) != 0x83) { av_push(list, newSVpv("2d", 2)); }
115
116 /*
117 * Are _word and _long really reading Network order?
118 */
119 if (_word(&buf) != 0x73f8) { av_push(list, newSVpv("3a", 2)); }
120 if (_word(&buf) != 0x0459) { av_push(list, newSVpv("3b", 2)); }
121 buf = check + 1;
122 if (_byte(&buf) != 0x04) { av_push(list, newSVpv("4a", 2)); }
123 if (_long(&buf) != 0xfe8373f8) { av_push(list, newSVpv("4b", 2)); }
124
125 /*
126 * Is U32 really not an I32?
127 */
128 buf = check + 2;
129 i = _long(&buf);
130 i ++;
131 if (i != 0xfe8373f9) { av_push(list, newSVpv("5", 1)); }
132
133 k = htonl(0x12345678);
134 if (memcmp((char*)&k+(4-1), "\x78", 1)) {
135 av_push(list, newSVpv("6a", 2));
136 }
137 if (memcmp((char*)&k+(4-2), "\x56\x78", 2)) {
138 av_push(list, newSVpv("6b", 2));
139 }
140 if (memcmp((char*)&k+(4-4), "\x12\x34\x56\x78", 4)) {
141 av_push(list, newSVpv("6c", 2));
142 }
143
144 return (list);
145 }
146
147 int
__limit_ol(SV * string,SV * o,SV * l,char ** ro,U32 * rl,U16 cs)148 __limit_ol (SV* string, SV* o, SV* l, char** ro, U32* rl, U16 cs) {
149 /*
150 * Checks, if offset and length are valid. If offset is negative, it is
151 * treated like a negative offset in perl.
152 *
153 * When successful, sets ro (real offset) and rl (real length).
154 */
155 STRLEN slen;
156 char* address;
157 I32 offset;
158 U32 length;
159
160 *ro = 0;
161 *rl = 0;
162
163 if (!SvOK(string)) {
164 if (PL_dowarn) { warn ("String undefined!"); }
165 return (0);
166 }
167
168 address = SvPV (string, slen);
169 offset = SvOK(o) ? SvIV(o) : 0;
170 length = SvOK(l) ? SvIV(l) : slen;
171
172 if (offset < 0) {
173 offset += slen;
174 }
175
176 if (offset < 0) {
177 offset = 0;
178 length = slen;
179 if (PL_dowarn) { warn ("Bad negative string offset!"); }
180 }
181
182 if (offset > slen) {
183 offset = slen;
184 length = 0;
185 if (PL_dowarn) { warn ("String offset to big!"); }
186 }
187
188 if (offset + length > slen) {
189 length = slen - offset;
190 if (PL_dowarn) { warn ("Bad string length!"); }
191 }
192
193 if (length % cs != 0) {
194 if (length>cs) {
195 length -= (length % cs);
196 } else {
197 length = 0;
198 }
199 if (PL_dowarn) { warn("Bad string size!"); }
200 }
201
202 *ro = address + offset;
203 *rl = length;
204
205 return (1);
206 }
207
208 int
__get_mode(char ** buf,U8 * num,U8 * method,U8 * keys,U8 * values)209 __get_mode (char** buf, U8* num, U8* method, U8* keys, U8* values) {
210 U8 type, size;
211
212 type = _byte(buf);
213 size = _byte(buf); *buf += size;
214
215 switch (type) {
216 case M_INF:
217 case M_BYTE:
218 *num = type; break;
219 case M_AKV:
220 case M_AKAV:
221 case M_PKV:
222 *method = type; break;
223 case M_CKn:
224 case M_CK:
225 *keys = type; break;
226 case M_CVn:
227 case M_CV:
228 *values = type; break;
229 }
230 return (type);
231 }
232
233 /*
234 * void = __read_binary_mapping (bufS, oS, UR, CR)
235 *
236 * Table of mode combinations:
237 *
238 * Mode | n1 n2 | INF BYTE | CK CKn | CV CVn
239 * ---------------------------------------------------------
240 * AKV | | | |
241 * AKAV | | | |
242 * PKV ok | ==1 ==1 | ok | ok | ok
243 */
244 int
__read_binary_mapping(SV * bufS,SV * oS,SV * UR,SV * CR)245 __read_binary_mapping (SV* bufS, SV* oS, SV* UR, SV* CR) {
246 char* buf;
247 U32 o;
248 HV* U; SV* uR; HV* u;
249 HV* C; SV* cR; HV* c;
250
251 int buflen;
252 char* bufmax;
253 U8 cs1, cs1b, cs2, cs2b;
254 U32 n1, n2;
255 U16 check;
256 U16 type=T_BAD;
257 U8 num1, method1, keys1, values1;
258 I16 kn, vn;
259 U32 kbegin, vbegin;
260 SV* Ustr;
261 SV* Cstr;
262 SV** tmp_spp;
263
264 buf = SvPVX (bufS);
265 o = SvIV (oS);
266 U = (HV *) SvRV (UR);
267 C = (HV *) SvRV (CR);
268
269 buflen = SvCUR(bufS); if (buflen < 2) {
270 /*
271 * Too short file. (No place for magic)
272 */
273 if ( PL_dowarn ) { warn ( "Bad map file: too short!" ); }
274 return (0);
275 }
276 bufmax = buf + buflen;
277 buf += o;
278 check = _word(&buf);
279
280 if (check == M_MAGIC) {
281 type = T_MAP;
282 } else if (
283 ( check == MAP8_BINFILE_MAGIC_HI ) &&
284 ( _word(&buf) == MAP8_BINFILE_MAGIC_LO )
285 ) {
286 type = T_MAP8;
287 }
288
289 if (type == T_BAD) {
290 if ( PL_dowarn ) { warn ( "Unknown map file format!" ); }
291 return (0);
292 }
293
294 num1 = num1_DEFAULT;
295 method1 = method1_DEFAULT;
296 keys1 = keys1_DEFAULT;
297 values1 = values1_DEFAULT;
298
299 while (buf<bufmax) {
300 U8 num2, method2, keys2, values2;
301 num2=num1; method2=method1; keys2=keys1; values2=values1;
302
303 if (type == T_MAP) {
304 cs1 = _byte (&buf);
305 if (!cs1) {
306 if (__get_mode(&buf, &num1, &method1, &keys1, &values1) == M_END) {
307 break;
308 }
309 continue;
310 } else {
311 n1 = _byte (&buf);
312 cs2 = _byte (&buf);
313 n2 = _byte (&buf);
314 }
315 cs1b = (cs1+7)/8;
316 cs2b = (cs2+7)/8;
317 } else if (type == T_MAP8) {
318 cs1b=1; n1=1; cs2b=2; n2=1;
319 }
320
321 Ustr = newSVpvf ("%d,%d,%d,%d", cs1b, n1, cs2b, n2);
322 Cstr = newSVpvf ("%d,%d,%d,%d", cs2b, n2, cs1b, n1);
323
324 /*
325 * Get, create hash for submapping of %U
326 */
327 if (!hv_exists_ent(U, Ustr, 0)) {
328 hv_store_ent(U, Ustr, newRV_inc((SV*) newHV()), 0);
329 }
330 tmp_spp = hv_fetch(U, SvPVX(Ustr), SvCUR(Ustr), 0);
331 if (!tmp_spp) {
332 if ( PL_dowarn ) { warn ( "Can't retrieve U submapping!" ); }
333 return (0);
334 } else {
335 uR = (SV *) *tmp_spp;
336 u = (HV *) SvRV (uR);
337 }
338
339 /*
340 * Get, create hash for submapping of %C
341 */
342 if (!hv_exists_ent(C, Cstr, 0)) {
343 hv_store_ent(C, Cstr, newRV_inc((SV*) newHV()), 0);
344 }
345 tmp_spp = hv_fetch(C, SvPVX(Cstr), SvCUR(Cstr), 0);
346 if (!tmp_spp) {
347 if ( PL_dowarn ) { warn ( "Can't retrieve C submapping!" ); }
348 return (0);
349 } else {
350 cR = (SV *) *tmp_spp;
351 c = (HV *) SvRV (cR);
352 }
353
354 if (type == T_MAP8) {
355 /*
356 * Map8 mode
357 */
358 /*
359 * => All (key, value) pairs
360 */
361 SV* tmpk; SV* tmpv;
362 while (buf<bufmax) {
363 if (buf[0] != '\0') {
364 if ( PL_dowarn ) { warn ( "Bad map file!" ); }
365 return (0);
366 }
367 tmpk = newSVpv(buf+1, 1); buf += 2;
368 tmpv = newSVpv(buf , 2); buf += 2;
369 if (buf > bufmax) { break; }
370
371 hv_store_ent(u, tmpk, tmpv, 0);
372 hv_store_ent(c, tmpv, tmpk, 0);
373 }
374 } else if (method1==M_AKV) {
375 /*
376 * Map mode
377 */
378 U32 ksize = n1*cs1b; SV* tmpk;
379 U32 vsize = n2*cs2b; SV* tmpv;
380 if ( num1==M_INF ) {
381 /*
382 * All (key, value) pairs
383 */
384 while (buf<bufmax) {
385 if ( buf+ksize+vsize>bufmax ) {
386 buf += ( ksize+vsize );
387 break;
388 }
389 tmpk = newSVpv(buf, ksize); buf += ksize;
390 tmpv = newSVpv(buf, vsize); buf += vsize;
391 hv_store_ent(c, tmpv, tmpk, 0);
392 hv_store_ent(u, tmpk, tmpv, 0);
393 }
394 } else if ( num1==M_BYTE ) {
395 while ( buf<bufmax ) {
396 if (!(kn=_byte(&buf))) {
397 if (__get_mode(&buf,&num2,&method2,&keys2,&values2)==M_END) {
398 break;
399 }
400 }
401 while ( kn>0 ) {
402 if ( buf+ksize+vsize>bufmax ) {
403 buf += ( ksize+vsize );
404 break;
405 }
406 tmpk = newSVpv(buf, ksize); buf += ksize;
407 tmpv = newSVpv(buf, vsize); buf += vsize;
408 hv_store_ent(c, tmpv, tmpk, 0);
409 hv_store_ent(u, tmpk, tmpv, 0);
410 kn--;
411 }
412 }
413 }
414 } else if (method1==M_AKAV) {
415 /*
416 * First all keys, then all values
417 */
418 if ( PL_dowarn ) { warn ( "M_AKAV not supported!" ); }
419 return (0);
420 } else if (method1==M_PKV) {
421 /*
422 * Partial
423 */
424 if (num1==M_INF) {
425 /* no infinite mode */
426 if ( PL_dowarn ) { warn ( "M_INF not supported for M_PKV!" ); }
427 return (0);
428 }
429 while(buf<bufmax) {
430 U8 num3, method3, keys3, values3;
431 num3=num2; method3=method2; keys3=keys2; values3=values2;
432 if (!(kn = _byte(&buf))) {
433 if (__get_mode(&buf,&num2,&method2,&keys2,&values2)==M_END) {
434 break;
435 }
436 continue;
437 }
438 switch (cs1b) {
439 case 1: kbegin = _byte(&buf); break;
440 case 2: kbegin = _word(&buf); break;
441 case 4: kbegin = _long(&buf); break;
442 default:
443 if ( PL_dowarn ) { warn ( "Unknown element size!" ); }
444 return (0);
445 }
446 while (kn>0) {
447 if (values3==M_CV) {
448 /*
449 * Partial, keys compressed, values compressed
450 */
451 SV* tmpk; U32 k;
452 SV* tmpv; U32 v;
453 U32 max;
454 vn = _byte(&buf);
455 if (!vn) {
456 if(__get_mode(&buf,&num3,&method3,&keys3,&values3)==M_END){
457 break;
458 }
459 continue;
460 }
461 if ((n1 != 1) || (n2 != 1)) {
462 /*
463 * n (n>1) characters cannot be mapped to one integer
464 */
465 if ( PL_dowarn ) { warn("Bad map file: count mismatch!"); }
466 return (0);
467 }
468 switch (cs2b) {
469 case 1: vbegin = _byte(&buf); break;
470 case 2: vbegin = _word(&buf); break;
471 case 4: vbegin = _long(&buf); break;
472 default:
473 if ( PL_dowarn ) { warn ( "Unknown element size!" ); }
474 return (0);
475 }
476
477 max = kbegin + vn;
478 for (; kbegin<max; kbegin++, vbegin++) {
479
480 k = htonl(kbegin);
481 tmpk = newSVpv((char *) &k + (4-cs1b), cs1b);
482
483 v = htonl(vbegin);
484 tmpv = newSVpv((char *) &v + (4-cs2b), cs2b);
485
486 hv_store_ent(c, tmpv, tmpk, 0);
487 hv_store_ent(u, tmpk, tmpv, 0);
488 }
489 kn-=vn;
490
491 } else if (values3==M_CVn) {
492 /*
493 * Partial, keys compressed, values not compressed
494 */
495 U32 v;
496 U32 vsize = n2*cs2b;
497 SV* tmpk;
498 SV* tmpv;
499 if (n1 != 1) {
500 if ( PL_dowarn ) { warn ( "Bad map file: mismatch 2!" ); }
501 return (0);
502 }
503 while (kn--) {
504 v = htonl(kbegin);
505 tmpk = newSVpv((char *) &v + (4-cs1b), cs1b);
506 tmpv = newSVpv(buf, vsize); buf += vsize;
507
508 hv_store_ent(u, tmpk, tmpv, 0);
509 hv_store_ent(c, tmpv, tmpk, 0);
510
511 kbegin++;
512 }
513 } else {
514 /*
515 * Unknown value compression.
516 */
517 if ( PL_dowarn ) { warn ( "Unknown compression!" ); }
518 return (0);
519 }
520 }
521 }
522 } else {
523 /*
524 * unknown method
525 */
526 if ( PL_dowarn ) { warn ( "Unknown method!" ); }
527 return (0);
528 }
529 }
530
531 return (1);
532 }
533
534 /*
535 *
536 * "Map.xs"
537 *
538 */
539
540 MODULE = Unicode::Map PACKAGE = Unicode::Map
541
542 PROTOTYPES: DISABLE
543
544 #
545 # $text = $Map -> reverse_unicode($text)
546 #
547 SV*
548 _reverse_unicode(Map, text)
549 SV* Map
550 SV* text
551
552 PREINIT:
553 int i;
554 char c;
555 STRLEN len;
556 char* src;
557 char* dest;
558
559 PPCODE:
560 src = SvPV (text, len);
561 if (PL_dowarn && (len % 2) != 0) {
562 warn("Bad string size!"); len--;
563 }
564 /* Code below adapted from GAAS's Unicode::String */
565 if ( GIMME_V == G_VOID ) {
566 if ( SvREADONLY(text) ) {
567 die ( "reverse_unicode: string is readonly!" );
568 }
569 dest = src;
570 } else {
571 SV* dest_sv = sv_2mortal ( newSV(len+1) );
572 SvCUR_set ( dest_sv, len );
573 *SvEND ( dest_sv ) = 0;
574 SvPOK_on ( dest_sv );
575 PUSHs ( dest_sv );
576 dest = SvPVX ( dest_sv );
577 }
578 for ( ; len>=2; len-=2 ) {
579 char tmp = *src++;
580 *dest++ = *src++;
581 *dest++ = tmp;
582 }
583
584 #
585 # $mapped_str = $Map -> _map_hash($string, \%mapping, $bytesize, offset, length)
586 #
587 # bytesize, offset, length in terms of bytes.
588 #
589 # bytesize gives the size of one character for this mapping.
590 #
591 SV*
592 _map_hash(Map, string, mappingR, bytesize, o, l)
593 SV* Map
594 SV* string
595 SV* mappingR
596 SV* bytesize
597 SV* o
598 SV* l
599
600 PREINIT:
601 char* offset; U32 length; U16 bs;
602 char* smax;
603 HV* mapping;
604 SV** tmp;
605
606 CODE:
607 bs = SvIV(bytesize);
608 __limit_ol (string, o, l, &offset, &length, bs);
609 smax = offset + length;
610
611 RETVAL = newSV((length/bs+1)*2);
612 mapping = (HV *) SvRV(mappingR);
613
614 for (; offset<smax; offset+=bs) {
615 if (tmp = hv_fetch(mapping, offset, bs, 0)) {
616 if ( SvOK(RETVAL) ) {
617 sv_catsv(RETVAL, *tmp);
618 } else {
619 sv_setsv(RETVAL, *tmp);
620 }
621 } else {
622 /* No mapping character found! */
623 }
624 }
625
626 OUTPUT:
627 RETVAL
628
629
630 #
631 # $mapped_str = $Map -> _map_hashlist($string, [@{\%mapping}], [@{$bytesize}])
632 #
633 # bytesize gives the size of one character for this mapping.
634 #
635 SV*
636 _map_hashlist(Map, string, mappingRLR, bytesizeLR, o, l)
637 SV* Map
638 SV* string
639 SV* mappingRLR
640 SV* bytesizeLR
641 SV* o
642 SV* l
643
644 PREINIT:
645 int j, max;
646 AV* mappingRL; HV* mapping;
647 AV* bytesizeL; int bytesize;
648 SV** tmp;
649 char* offset; U32 length; char* smax;
650
651 CODE:
652 __limit_ol (string, o, l, &offset, &length, 1);
653 smax = offset + length;
654
655 RETVAL = newSV((length+1)*2);
656
657 mappingRL = (AV *) SvRV(mappingRLR);
658 bytesizeL = (AV *) SvRV(bytesizeLR);
659 max = av_len(mappingRL);
660 if (max != av_len(bytesizeL)) {
661 warn("$#mappingRL != $#bytesizeL!");
662 } else {
663 max++;
664 for (; offset<smax; ) {
665 for (j=0; j<=max; j++) {
666 if (j==max) {
667 /* No mapping character found!
668 * How many bytes does this unknown character consume?
669 * Sigh, assume 2.
670 */
671 offset += 2;
672 } else {
673 if (tmp = av_fetch(mappingRL, j, 0)) {
674 mapping = (HV *) SvRV((SV*) *tmp);
675 if (tmp = av_fetch(bytesizeL, j, 0)) {
676 bytesize = SvIV(*tmp);
677 if (tmp = hv_fetch(mapping, offset, bytesize, 0)) {
678 if ( SvOK(RETVAL) ) {
679 sv_catsv(RETVAL, *tmp);
680 } else {
681 sv_setsv(RETVAL, *tmp);
682 }
683 offset+=bytesize;
684 break;
685 }
686 }
687 }
688 }
689 }
690 }
691 }
692
693 OUTPUT:
694 RETVAL
695
696
697 #
698 # status = $S->_read_binary_mapping($buf, $o, \%U, \%C);
699 #
700 SV*
701 _read_binary_mapping (MapS, bufS, oS, UR, CR)
702 SV* MapS
703 SV* bufS
704 SV* oS
705 SV* UR
706 SV* CR
707
708 CODE:
709 RETVAL = newSViv(__read_binary_mapping(bufS, oS, UR, CR));
710
711 OUTPUT:
712 RETVAL
713
714
715 #
716 # 0 || errornum = $S->_test ()
717 #
718 AV*
719 _system_test (void)
720 CODE:
721 RETVAL = __system_test();
722 OUTPUT:
723 RETVAL
724
725