1 /*
2 Data structures for encoding transformations.
3 
4 Perl works internally in either a native 'byte' encoding or
5 in UTF-8 encoded Unicode.  We have no immediate need for a "wchar_t"
6 representation. When we do we can use utf8_to_uv().
7 
8 Most character encodings are either simple byte mappings or
9 variable length multi-byte encodings. UTF-8 can be viewed as a
10 rather extreme case of the latter.
11 
So to solve an important part of perl's encode needs we need to solve the
"multi-byte -> multi-byte" case. The simple byte forms are then just a
degenerate case. (Where one of the multi-byte encodings will usually be UTF-8.)
15 
16 The other type of encoding is a shift encoding where a prefix sequence
17 determines what subsequent bytes mean. Such encodings have state.
18 
We also need to handle the case where a character in one encoding has to be
represented as multiple characters in the other, e.g. letter+diacritic.
21 
22 The process can be considered as pseudo perl:
23 
24 my $dst = '';
25 while (length($src))
26  {
27   my $size    = src_count($src);
28   my $in_seq  = substr($src,0,$size,'');
29   my $out_seq = $s2d_hash{$in_seq};
30   if (defined $out_seq)
31    {
32     $dst .= $out_seq;
33    }
34   else
35    {
36     # an error condition
37    }
38  }
39 return $dst;
40 
41 That has the following components:
42  &src_count - a "rule" for how many bytes make up the next character in the
43               source.
44  %s2d_hash  - a mapping from input sequences to output sequences
45 
46 The problem with that scheme is that it does not allow the output
47 character repertoire to affect the characters considered from the
48 input.
49 
50 So we use a "trie" representation which can also be considered
51 a state machine:
52 
53 my $dst   = '';
54 my $seq   = \@s2d_seq;
55 my $next  = \@s2d_next;
56 while (length($src))
57  {
  my $byte    = substr($src,0,1,'');
59   my $out_seq = $seq->[$byte];
60   if (defined $out_seq)
61    {
62     $dst .= $out_seq;
63    }
64   else
65    {
66     # an error condition
67    }
  ($next,$seq) = @{$next->[$byte]} if $next;
69  }
70 return $dst;
71 
There is now a pair of data structures to represent everything.
It is valid for the output sequence at a particular point to
be defined but zero length; that just means "don't know yet".
For the single byte case there is no 'next' so the new tables will be the same
as the original tables. For a multi-byte case a prefix byte will flip to the
tables for the next page (adding nothing to the output), then the tables for
the page will provide the actual output and set the tables back to the original
base page.
79 
80 This scheme can also handle shift encodings.
81 
82 A slight enhancement to the scheme also allows for look-ahead - if
83 we add a flag to re-add the removed byte to the source we could handle
84   a" -> U+00E4 (LATIN SMALL LETTER A WITH DIAERESIS)
85   ab -> a (and take b back please)
86 
87 */
88 
89 #define PERL_NO_GET_CONTEXT
90 #include <EXTERN.h>
91 #include <perl.h>
92 #include "encode.h"
93 
94 int
do_encode(const encpage_t * enc,const U8 * src,STRLEN * slen,U8 * dst,STRLEN dlen,STRLEN * dout,int approx,const U8 * term,STRLEN tlen)95 do_encode(const encpage_t * enc, const U8 * src, STRLEN * slen, U8 * dst,
96       STRLEN dlen, STRLEN * dout, int approx, const U8 *term, STRLEN tlen)
97 {
98     const U8 *s = src;
99     const U8 *send = s + *slen;
100     const U8 *last = s;
101     U8 *d = dst;
102     U8 *dend = d + dlen, *dlast = d;
103     int code = 0;
104     if (!dst)
105       return ENCODE_NOSPACE;
106     while (s < send) {
107         const encpage_t *e = enc;
108         U8 byte = *s;
109         while (byte > e->max)
110             e++;
111         if (byte >= e->min && e->slen && (approx || !(e->slen & 0x80))) {
112             const U8 *cend = s + (e->slen & 0x7f);
113             if (cend <= send) {
114                 STRLEN n;
115                 if ((n = e->dlen)) {
116                     const U8 *out = e->seq + n * (byte - e->min);
117                     U8 *oend = d + n;
118                     if (dst) {
119                         if (oend <= dend) {
120                             while (d < oend)
121                                 *d++ = *out++;
122                         }
123                         else {
124                             /* Out of space */
125                             code = ENCODE_NOSPACE;
126                             break;
127                         }
128                     }
129                     else
130                         d = oend;
131                 }
132                 enc = e->next;
133                 s++;
134                 if (s == cend) {
135                     if (approx && (e->slen & 0x80))
136                         code = ENCODE_FALLBACK;
137                     last = s;
138                     if (term && (STRLEN)(d-dlast) == tlen && memEQ(dlast, term, tlen)) {
139                         code = ENCODE_FOUND_TERM;
140                         break;
141                     }
142                     dlast = d;
143                 }
144             }
145             else {
146                 /* partial source character */
147                 code = ENCODE_PARTIAL;
148                 break;
149             }
150         }
151         else {
152             /* Cannot represent */
153             code = ENCODE_NOREP;
154             break;
155         }
156     }
157     *slen = last - src;
158     *dout = d - dst;
159     return code;
160 }
161