1 /*
2  * $Id: Map.xs,v 1.28 1998/03/23 23:57:46 schwartz Exp $
3  *
4  * ALPHA version
5  *
6  * Unicode::Map - C extensions
7  *
8  * Interface documentation at Map.pm
9  *
10  * Copyright (C) 1998, 1999, 2000 Martin Schwartz. All rights reserved.
11  * This program is free software; you can redistribute it and/or
12  * modify it under the same terms as Perl itself.
13  *
14  * Contact: Martin Schwartz <martin@nacho.de>
15  */
16 
17 #ifdef __cplusplus
18 extern "C" {
19 #endif
20 #include "EXTERN.h"
21 #include "perl.h"
22 #include "XSUB.h"
23 #ifdef __cplusplus
24 }
25 #endif
26 
27 /*
28  * It seems that dowarn isn't defined on some systems, PL_dowarn not on
29  * others. Gisle Aas deals with it this way:
30  */
31 #include "patchlevel.h"
32 #if PATCHLEVEL <= 4 && !defined(PL_dowarn)
33    #define PL_dowarn dowarn
34 #endif
35 
36 /*
37  *
38  * "Map.h"
39  *
40  */
41 
42 #define M_MAGIC               0xb827 /* magic word */
43 #define MAP8_BINFILE_MAGIC_HI 0xfffe /* magic word for Gisle's file format */
44 #define MAP8_BINFILE_MAGIC_LO 0x0001 /* */
45 
46 #define M_END   0       /* end */
47 #define M_INF   1       /* infinite subsequent entries (default) */
48 #define M_BYTE  2       /* 1..255 subsequent entries  */
49 #define M_VER   4       /* (Internal) file format revision. */
50 #define M_AKV   6       /* key1, val1, key2, val2, ... (default) */
51 #define M_AKAV  7       /* key1, key2, ..., val1, val2, ... */
52 #define M_PKV   8       /* partial key value mappings */
53 #define M_CKn   10      /* compress keys not */
54 #define M_CK    11      /* compress keys (default) */
55 #define M_CVn   13      /* compress values not */
56 #define M_CV    14      /* compress values (default) */
57 
58 #define I_NAME  20      /* Info: (wstring) Character Set Name */
59 #define I_ALIAS 21      /* Info: (wstring) Charset alias (several entries ok) */
60 #define I_VER   22      /* Info: (wstring) Mapfile revision */
61 #define I_AUTH  23 	/* Info: (wstring) Mapfile authRess */
62 #define I_INFO  24      /* Info: (wstring) Some userEss definable string */
63 
64 #define T_BAD   0	/* Type: unknown */
65 #define T_MAP8  1	/* Type: Map8 style */
66 #define T_MAP   2	/* Type: Map style */
67 
68 #define num1_DEFAULT    M_INF;
69 #define method1_DEFAULT M_AKV;
70 #define keys1_DEFAULT   M_CK;
71 #define values1_DEFAULT M_CV;
72 
73 /* No function prototypes (as very old C-Compilers don't like them) */
74 
75 /*
76  *
77  * "Map.c"
78  *
79  */
80 
_byte(char ** buf)81 U8  _byte(char** buf) {
82    U8* tmp = (U8*) *buf; *buf+=1; return tmp[0];
83 }
_word(char ** buf)84 U16 _word(char** buf) {
85    U16 tmp; memcpy ((char*) &tmp, *buf, 2); *buf+=2; return ntohs(tmp);
86 }
_long(char ** buf)87 U32 _long(char** buf) {
88    U32 tmp; memcpy ((char*) &tmp, *buf, 4); *buf+=4; return ntohl(tmp);
89 }
90 
__system_test(void)91 AV* __system_test (void) {
92 /*
93  * If this test suit gets passed ok, the C methods will probably work.
94  */
95    char* check = "\x01\x04\xfe\x83\x73\xf8\x04\x59\x19";
96    char* buf;
97    AV*   list = newAV();
98    U32   i, k;
99 
100    /*
101     * Have the Unn the bytesize I assume?
102     */
103    if (sizeof(U8)!=1)  { av_push (list, newSVpv("1a", 1)); }
104    if (sizeof(U16)!=2) { av_push (list, newSVpv("1b", 1)); }
105    if (sizeof(U32)!=4) { av_push (list, newSVpv("1c", 1)); }
106 
107    /*
108     * Does _byte work?
109     */
110    buf = check;
111    if (_byte(&buf) != 0x01) { av_push(list, newSVpv("2a", 2)); }
112    if (_byte(&buf) != 0x04) { av_push(list, newSVpv("2b", 2)); }
113    if (_byte(&buf) != 0xfe) { av_push(list, newSVpv("2c", 2)); }
114    if (_byte(&buf) != 0x83) { av_push(list, newSVpv("2d", 2)); }
115 
116    /*
117     * Are _word and _long really reading Network order?
118     */
119    if (_word(&buf) != 0x73f8)     { av_push(list, newSVpv("3a", 2)); }
120    if (_word(&buf) != 0x0459)     { av_push(list, newSVpv("3b", 2)); }
121    buf = check + 1;
122    if (_byte(&buf) != 0x04)       { av_push(list, newSVpv("4a", 2)); }
123    if (_long(&buf) != 0xfe8373f8) { av_push(list, newSVpv("4b", 2)); }
124 
125    /*
126     * Is U32 really not an I32?
127     */
128    buf = check + 2;
129    i = _long(&buf);
130    i ++;
131    if (i != 0xfe8373f9) { av_push(list, newSVpv("5", 1)); }
132 
133    k = htonl(0x12345678);
134    if (memcmp((char*)&k+(4-1), "\x78", 1)) {
135       av_push(list, newSVpv("6a", 2));
136    }
137    if (memcmp((char*)&k+(4-2), "\x56\x78", 2)) {
138       av_push(list, newSVpv("6b", 2));
139    }
140    if (memcmp((char*)&k+(4-4), "\x12\x34\x56\x78", 4)) {
141       av_push(list, newSVpv("6c", 2));
142    }
143 
144    return (list);
145 }
146 
147 int
__limit_ol(SV * string,SV * o,SV * l,char ** ro,U32 * rl,U16 cs)148 __limit_ol (SV* string, SV* o, SV* l, char** ro, U32* rl, U16 cs) {
149 /*
150  * Checks, if offset and length are valid. If offset is negative, it is
151  * treated like a negative offset in perl.
152  *
153  * When successful, sets ro (real offset) and rl (real length).
154  */
155    STRLEN  slen;
156    char*   address;
157    I32     offset;
158    U32     length;
159 
160    *ro = 0;
161    *rl = 0;
162 
163    if (!SvOK(string)) {
164       if (PL_dowarn) { warn ("String undefined!"); }
165       return (0);
166    }
167 
168    address = SvPV (string, slen);
169    offset  = SvOK(o) ? SvIV(o) : 0;
170    length  = SvOK(l) ? SvIV(l) : slen;
171 
172    if (offset < 0) {
173       offset += slen;
174    }
175 
176    if (offset < 0) {
177       offset = 0;
178       length = slen;
179       if (PL_dowarn) { warn ("Bad negative string offset!"); }
180    }
181 
182    if (offset > slen) {
183       offset = slen;
184       length = 0;
185       if (PL_dowarn) { warn ("String offset to big!"); }
186    }
187 
188    if (offset + length > slen) {
189       length = slen - offset;
190       if (PL_dowarn) { warn ("Bad string length!"); }
191    }
192 
193    if (length % cs != 0) {
194       if (length>cs) {
195          length -= (length % cs);
196       } else {
197          length = 0;
198       }
199       if (PL_dowarn) { warn("Bad string size!"); }
200    }
201 
202    *ro = address + offset;
203    *rl = length;
204 
205    return (1);
206 }
207 
208 int
__get_mode(char ** buf,U8 * num,U8 * method,U8 * keys,U8 * values)209 __get_mode (char** buf, U8* num, U8* method, U8* keys, U8* values) {
210    U8 type, size;
211 
212    type = _byte(buf);
213    size = _byte(buf); *buf += size;
214 
215    switch (type) {
216       case M_INF:
217       case M_BYTE:
218          *num = type; break;
219       case M_AKV:
220       case M_AKAV:
221       case M_PKV:
222          *method = type; break;
223       case M_CKn:
224       case M_CK:
225          *keys = type; break;
226       case M_CVn:
227       case M_CV:
228          *values = type; break;
229    }
230    return (type);
231 }
232 
233 /*
234  *  void = __read_binary_mapping (bufS, oS, UR, CR)
235  *
236  *  Table of mode combinations:
237  *
238  *  Mode      | n1  n2  | INF  BYTE  |  CK  CKn  |  CV  CVn
239  *  ---------------------------------------------------------
240  *  AKV       |         |            |           |
241  *  AKAV      |         |            |           |
242  *  PKV   ok  | ==1 ==1 |      ok    |  ok       |  ok
243  */
244 int
__read_binary_mapping(SV * bufS,SV * oS,SV * UR,SV * CR)245 __read_binary_mapping (SV* bufS, SV* oS, SV* UR, SV* CR) {
246    char* buf;
247    U32 o;
248    HV* U; SV* uR; HV* u;
249    HV* C; SV* cR; HV* c;
250 
251    int   buflen;
252    char* bufmax;
253    U8    cs1, cs1b, cs2, cs2b;
254    U32   n1, n2;
255    U16   check;
256    U16   type=T_BAD;
257    U8    num1, method1, keys1, values1;
258    I16   kn, vn;
259    U32   kbegin, vbegin;
260    SV*   Ustr;
261    SV*   Cstr;
262    SV**  tmp_spp;
263 
264    buf =        SvPVX (bufS);
265    o   =        SvIV (oS);
266    U   = (HV *) SvRV (UR);
267    C   = (HV *) SvRV (CR);
268 
269    buflen = SvCUR(bufS); if (buflen < 2) {
270       /*
271        * Too short file. (No place for magic)
272        */
273       if ( PL_dowarn ) { warn ( "Bad map file: too short!" ); }
274       return (0);
275    }
276    bufmax = buf + buflen;
277    buf += o;
278    check = _word(&buf);
279 
280    if (check == M_MAGIC) {
281       type = T_MAP;
282    } else if (
283       ( check == MAP8_BINFILE_MAGIC_HI ) &&
284       ( _word(&buf) == MAP8_BINFILE_MAGIC_LO )
285    ) {
286       type = T_MAP8;
287    }
288 
289    if (type == T_BAD) {
290       if ( PL_dowarn ) { warn ( "Unknown map file format!" ); }
291       return (0);
292    }
293 
294    num1    = num1_DEFAULT;
295    method1 = method1_DEFAULT;
296    keys1   = keys1_DEFAULT;
297    values1 = values1_DEFAULT;
298 
299    while (buf<bufmax) {
300       U8 num2, method2, keys2, values2;
301       num2=num1; method2=method1; keys2=keys1; values2=values1;
302 
303       if (type == T_MAP) {
304          cs1 = _byte (&buf);
305          if (!cs1) {
306             if (__get_mode(&buf, &num1, &method1, &keys1, &values1) == M_END) {
307                break;
308             }
309             continue;
310          } else {
311             n1  = _byte (&buf);
312             cs2 = _byte (&buf);
313             n2  = _byte (&buf);
314          }
315          cs1b = (cs1+7)/8;
316          cs2b = (cs2+7)/8;
317       } else if (type == T_MAP8) {
318          cs1b=1; n1=1; cs2b=2; n2=1;
319       }
320 
321       Ustr = newSVpvf ("%d,%d,%d,%d", cs1b, n1, cs2b, n2);
322       Cstr = newSVpvf ("%d,%d,%d,%d", cs2b, n2, cs1b, n1);
323 
324       /*
325        * Get, create hash for submapping of %U
326        */
327       if (!hv_exists_ent(U, Ustr, 0)) {
328          hv_store_ent(U, Ustr, newRV_inc((SV*) newHV()), 0);
329       }
330       tmp_spp = hv_fetch(U, SvPVX(Ustr), SvCUR(Ustr), 0);
331       if (!tmp_spp) {
332          if ( PL_dowarn ) { warn ( "Can't retrieve U submapping!" ); }
333          return (0);
334       } else {
335          uR = (SV *) *tmp_spp;
336          u  = (HV *) SvRV (uR);
337       }
338 
339       /*
340        * Get, create hash for submapping of %C
341        */
342       if (!hv_exists_ent(C, Cstr, 0)) {
343          hv_store_ent(C, Cstr, newRV_inc((SV*) newHV()), 0);
344       }
345       tmp_spp = hv_fetch(C, SvPVX(Cstr), SvCUR(Cstr), 0);
346       if (!tmp_spp) {
347          if ( PL_dowarn ) { warn ( "Can't retrieve C submapping!" ); }
348          return (0);
349       } else {
350          cR = (SV *) *tmp_spp;
351          c  = (HV *) SvRV (cR);
352       }
353 
354       if (type == T_MAP8) {
355       /*
356        * Map8 mode
357        */
358          /*
359           * => All (key, value) pairs
360           */
361          SV* tmpk; SV* tmpv;
362          while (buf<bufmax) {
363             if (buf[0] != '\0') {
364                if ( PL_dowarn ) { warn ( "Bad map file!" ); }
365                return (0);
366             }
367             tmpk = newSVpv(buf+1, 1); buf += 2;
368             tmpv = newSVpv(buf  , 2); buf += 2;
369             if (buf > bufmax) { break; }
370 
371             hv_store_ent(u, tmpk, tmpv, 0);
372             hv_store_ent(c, tmpv, tmpk, 0);
373          }
374       } else if (method1==M_AKV) {
375       /*
376        * Map mode
377        */
378          U32 ksize = n1*cs1b; SV* tmpk;
379          U32 vsize = n2*cs2b; SV* tmpv;
380          if ( num1==M_INF ) {
381             /*
382              * All (key, value) pairs
383              */
384             while (buf<bufmax) {
385                if ( buf+ksize+vsize>bufmax ) {
386                   buf += ( ksize+vsize );
387                   break;
388                }
389                tmpk = newSVpv(buf, ksize); buf += ksize;
390                tmpv = newSVpv(buf, vsize); buf += vsize;
391                hv_store_ent(c, tmpv, tmpk, 0);
392                hv_store_ent(u, tmpk, tmpv, 0);
393             }
394          } else if ( num1==M_BYTE ) {
395             while ( buf<bufmax ) {
396                if (!(kn=_byte(&buf))) {
397                   if (__get_mode(&buf,&num2,&method2,&keys2,&values2)==M_END) {
398                      break;
399                   }
400                }
401                while ( kn>0 ) {
402                   if ( buf+ksize+vsize>bufmax ) {
403                      buf += ( ksize+vsize );
404                      break;
405                   }
406                   tmpk = newSVpv(buf, ksize); buf += ksize;
407                   tmpv = newSVpv(buf, vsize); buf += vsize;
408                   hv_store_ent(c, tmpv, tmpk, 0);
409                   hv_store_ent(u, tmpk, tmpv, 0);
410                   kn--;
411                }
412             }
413          }
414       } else if (method1==M_AKAV) {
415          /*
416           * First all keys, then all values
417           */
418          if ( PL_dowarn ) { warn ( "M_AKAV not supported!" ); }
419          return (0);
420       } else if (method1==M_PKV) {
421          /*
422           * Partial
423           */
424          if (num1==M_INF) {
425             /* no infinite mode */
426             if ( PL_dowarn ) { warn ( "M_INF not supported for M_PKV!" ); }
427             return (0);
428          }
429          while(buf<bufmax) {
430             U8 num3, method3, keys3, values3;
431             num3=num2; method3=method2; keys3=keys2; values3=values2;
432             if (!(kn = _byte(&buf))) {
433                if (__get_mode(&buf,&num2,&method2,&keys2,&values2)==M_END) {
434                   break;
435                }
436                continue;
437             }
438             switch (cs1b) {
439                case 1: kbegin = _byte(&buf); break;
440                case 2: kbegin = _word(&buf); break;
441                case 4: kbegin = _long(&buf); break;
442                default:
443                   if ( PL_dowarn ) { warn ( "Unknown element size!" ); }
444                   return (0);
445             }
446             while (kn>0) {
447                if (values3==M_CV) {
448                   /*
449                    * Partial, keys compressed, values compressed
450                    */
451                   SV* tmpk; U32 k;
452                   SV* tmpv; U32 v;
453                   U32 max;
454                   vn = _byte(&buf);
455                   if (!vn) {
456                      if(__get_mode(&buf,&num3,&method3,&keys3,&values3)==M_END){
457                         break;
458                      }
459                      continue;
460                   }
461                   if ((n1 != 1) || (n2 != 1)) {
462                      /*
463                       * n (n>1) characters cannot be mapped to one integer
464                       */
465                      if ( PL_dowarn ) { warn("Bad map file: count mismatch!"); }
466                      return (0);
467                   }
468                   switch (cs2b) {
469                      case 1: vbegin = _byte(&buf); break;
470                      case 2: vbegin = _word(&buf); break;
471                      case 4: vbegin = _long(&buf); break;
472                      default:
473                         if ( PL_dowarn ) { warn ( "Unknown element size!" ); }
474                         return (0);
475                   }
476 
477                   max = kbegin + vn;
478                   for (; kbegin<max; kbegin++, vbegin++) {
479 
480                      k = htonl(kbegin);
481                      tmpk = newSVpv((char *) &k + (4-cs1b), cs1b);
482 
483                      v = htonl(vbegin);
484                      tmpv = newSVpv((char *) &v + (4-cs2b), cs2b);
485 
486                      hv_store_ent(c, tmpv, tmpk, 0);
487                      hv_store_ent(u, tmpk, tmpv, 0);
488                   }
489                   kn-=vn;
490 
491                } else if (values3==M_CVn) {
492                   /*
493                    * Partial, keys compressed, values not compressed
494                    */
495                   U32 v;
496                   U32 vsize = n2*cs2b;
497                   SV* tmpk;
498                   SV* tmpv;
499                   if (n1 != 1) {
500                      if ( PL_dowarn ) { warn ( "Bad map file: mismatch 2!" ); }
501                      return (0);
502                   }
503                   while (kn--) {
504                      v = htonl(kbegin);
505                      tmpk = newSVpv((char *) &v + (4-cs1b), cs1b);
506                      tmpv = newSVpv(buf, vsize); buf += vsize;
507 
508                      hv_store_ent(u, tmpk, tmpv, 0);
509                      hv_store_ent(c, tmpv, tmpk, 0);
510 
511                      kbegin++;
512                   }
513                } else {
514                /*
515                 * Unknown value compression.
516                 */
517                   if ( PL_dowarn ) { warn ( "Unknown compression!" ); }
518                   return (0);
519                }
520             }
521          }
522       } else {
523          /*
524           * unknown method
525           */
526          if ( PL_dowarn ) { warn ( "Unknown method!" ); }
527          return (0);
528       }
529    }
530 
531    return (1);
532 }
533 
534 /*
535  *
536  * "Map.xs"
537  *
538  */
539 
540 MODULE = Unicode::Map	PACKAGE = Unicode::Map
541 
542 PROTOTYPES: DISABLE
543 
544 #
545 # $text = $Map -> reverse_unicode($text)
546 #
547 SV*
548 _reverse_unicode(Map, text)
549         SV*  Map
550         SV*  text
551 
552         PREINIT:
553         int i;
554         char c;
555         STRLEN len;
556         char* src;
557         char* dest;
558 
559         PPCODE:
560 	src = SvPV (text, len);
561 	if (PL_dowarn && (len % 2) != 0) {
562     	   warn("Bad string size!"); len--;
563 	}
564         /* Code below adapted from GAAS's Unicode::String */
565         if ( GIMME_V == G_VOID ) {
566            if ( SvREADONLY(text) ) {
567               die ( "reverse_unicode: string is readonly!" );
568            }
569            dest = src;
570         } else {
571            SV* dest_sv = sv_2mortal ( newSV(len+1) );
572            SvCUR_set ( dest_sv, len );
573            *SvEND ( dest_sv ) = 0;
574            SvPOK_on ( dest_sv );
575            PUSHs ( dest_sv );
576            dest = SvPVX ( dest_sv );
577         }
578         for ( ; len>=2; len-=2 ) {
579             char tmp = *src++;
580             *dest++ = *src++;
581             *dest++ = tmp;
582         }
583 
584 #
585 # $mapped_str = $Map -> _map_hash($string, \%mapping, $bytesize, offset, length)
586 #
587 # bytesize, offset, length in terms of bytes.
588 #
589 # bytesize gives the size of one character for this mapping.
590 #
591 SV*
592 _map_hash(Map, string, mappingR, bytesize, o, l)
593         SV*  Map
594         SV*  string
595         SV*  mappingR
596         SV*  bytesize
597         SV*  o
598         SV*  l
599 
600         PREINIT:
601         char* offset; U32 length; U16 bs;
602         char* smax;
603         HV*   mapping;
604         SV**  tmp;
605 
606         CODE:
607         bs = SvIV(bytesize);
608         __limit_ol (string, o, l, &offset, &length, bs);
609         smax = offset + length;
610 
611         RETVAL = newSV((length/bs+1)*2);
612         mapping = (HV *) SvRV(mappingR);
613 
614         for (; offset<smax; offset+=bs) {
615            if (tmp = hv_fetch(mapping, offset, bs, 0)) {
616               if ( SvOK(RETVAL) ) {
617                  sv_catsv(RETVAL, *tmp);
618               } else {
619                  sv_setsv(RETVAL, *tmp);
620               }
621            } else {
622               /* No mapping character found! */
623            }
624         }
625 
626         OUTPUT:
627 	   RETVAL
628 
629 
630 #
631 # $mapped_str = $Map -> _map_hashlist($string, [@{\%mapping}], [@{$bytesize}])
632 #
633 # bytesize gives the size of one character for this mapping.
634 #
635 SV*
636 _map_hashlist(Map, string, mappingRLR, bytesizeLR, o, l)
637         SV*  Map
638         SV*  string
639         SV*  mappingRLR
640         SV*  bytesizeLR
641         SV*  o
642         SV*  l
643 
644         PREINIT:
645         int j, max;
646         AV* mappingRL; HV* mapping;
647         AV* bytesizeL; int bytesize;
648         SV** tmp;
649         char* offset; U32 length; char* smax;
650 
651         CODE:
652         __limit_ol (string, o, l, &offset, &length, 1);
653         smax = offset + length;
654 
655         RETVAL = newSV((length+1)*2);
656 
657 	mappingRL = (AV *) SvRV(mappingRLR);
658         bytesizeL = (AV *) SvRV(bytesizeLR);
659         max = av_len(mappingRL);
660         if (max != av_len(bytesizeL)) {
661 	   warn("$#mappingRL != $#bytesizeL!");
662 	} else {
663            max++;
664            for (; offset<smax; ) {
665               for (j=0; j<=max; j++) {
666                  if (j==max) {
667                     /* No mapping character found!
668                      * How many bytes does this unknown character consume?
669                      * Sigh, assume 2.
670                      */
671                     offset += 2;
672                  } else {
673   	            if (tmp = av_fetch(mappingRL, j, 0)) {
674                        mapping = (HV *) SvRV((SV*) *tmp);
675                        if (tmp = av_fetch(bytesizeL, j, 0)) {
676                           bytesize = SvIV(*tmp);
677                           if (tmp = hv_fetch(mapping, offset, bytesize, 0)) {
678                              if ( SvOK(RETVAL) ) {
679                                 sv_catsv(RETVAL, *tmp);
680                              } else {
681                                 sv_setsv(RETVAL, *tmp);
682                              }
683                              offset+=bytesize;
684                              break;
685                           }
686                        }
687                     }
688                  }
689               }
690            }
691         }
692 
693         OUTPUT:
694 	   RETVAL
695 
696 
697 #
698 # status = $S->_read_binary_mapping($buf, $o, \%U, \%C);
699 #
700 SV*
701 _read_binary_mapping (MapS, bufS, oS, UR, CR)
702 	SV* MapS
703 	SV* bufS
704 	SV* oS
705 	SV* UR
706 	SV* CR
707 
708 	CODE:
709 	RETVAL = newSViv(__read_binary_mapping(bufS, oS, UR, CR));
710 
711 	OUTPUT:
712 	   RETVAL
713 
714 
715 #
716 # 0 || errornum = $S->_test ()
717 #
718 AV*
719 _system_test (void)
720 	CODE:
721 	RETVAL = __system_test();
722 	OUTPUT:
723 	RETVAL
724 
725