1 /* 2 Data structures for encoding transformations. 3 4 Perl works internally in either a native 'byte' encoding or 5 in UTF-8 encoded Unicode. We have no immediate need for a "wchar_t" 6 representation. When we do we can use utf8_to_uv(). 7 8 Most character encodings are either simple byte mappings or 9 variable length multi-byte encodings. UTF-8 can be viewed as a 10 rather extreme case of the latter. 11 12 So to solve an important part of perl's encode needs we need to solve the 13 "multi-byte -> multi-byte" case. The simple byte forms are then just degenerate 14 case. (Where one of multi-bytes will usually be UTF-8.) 15 16 The other type of encoding is a shift encoding where a prefix sequence 17 determines what subsequent bytes mean. Such encodings have state. 18 19 We also need to handle case where a character in one encoding has to be 20 represented as multiple characters in the other. e.g. letter+diacritic. 21 22 The process can be considered as pseudo perl: 23 24 my $dst = ''; 25 while (length($src)) 26 { 27 my $size = $count($src); 28 my $in_seq = substr($src,0,$size,''); 29 my $out_seq = $s2d_hash{$in_seq}; 30 if (defined $out_seq) 31 { 32 $dst .= $out_seq; 33 } 34 else 35 { 36 # an error condition 37 } 38 } 39 return $dst; 40 41 That has the following components: 42 &src_count - a "rule" for how many bytes make up the next character in the 43 source. 44 %s2d_hash - a mapping from input sequences to output sequences 45 46 The problem with that scheme is that it does not allow the output 47 character repertoire to affect the characters considered from the 48 input. 49 50 So we use a "trie" representation which can also be considered 51 a state machine: 52 53 my $dst = ''; 54 my $seq = \@s2d_seq; 55 my $next = \@s2d_next; 56 while (length($src)) 57 { 58 my $byte = $substr($src,0,1,''); 59 my $out_seq = $seq->[$byte]; 60 if (defined $out_seq) 61 { 62 $dst .= $out_seq; 63 } 64 else 65 { 66 # an error condition 67 } 68 ($next,$seq) = @$next->[$byte] if $next; 69 } 70 return $dst; 71 72 There is now a pair of data structures to represent everything. 73 It is valid for output sequence at a particular point to 74 be defined but zero length, that just means "don't know yet". 75 For the single byte case there is no 'next' so new tables will be the same as 76 the original tables. For a multi-byte case a prefix byte will flip to the tables 77 for the next page (adding nothing to the output), then the tables for the page 78 will provide the actual output and set tables back to original base page. 79 80 This scheme can also handle shift encodings. 81 82 A slight enhancement to the scheme also allows for look-ahead - if 83 we add a flag to re-add the removed byte to the source we could handle 84 a" -> U+00E4 (LATIN SMALL LETTER A WITH DIAERESIS) 85 ab -> a (and take b back please) 86 87 */ 88 89 #include <EXTERN.h> 90 #include <perl.h> 91 #include "encode.h" 92 93 int 94 do_encode(const encpage_t * enc, const U8 * src, STRLEN * slen, U8 * dst, 95 STRLEN dlen, STRLEN * dout, int approx, const U8 *term, STRLEN tlen) 96 { 97 const U8 *s = src; 98 const U8 *send = s + *slen; 99 const U8 *last = s; 100 U8 *d = dst; 101 U8 *dend = d + dlen, *dlast = d; 102 int code = 0; 103 while (s < send) { 104 const encpage_t *e = enc; 105 U8 byte = *s; 106 while (byte > e->max) 107 e++; 108 if (byte >= e->min && e->slen && (approx || !(e->slen & 0x80))) { 109 const U8 *cend = s + (e->slen & 0x7f); 110 if (cend <= send) { 111 STRLEN n; 112 if ((n = e->dlen)) { 113 const U8 *out = e->seq + n * (byte - e->min); 114 U8 *oend = d + n; 115 if (dst) { 116 if (oend <= dend) { 117 while (d < oend) 118 *d++ = *out++; 119 } 120 else { 121 /* Out of space */ 122 code = ENCODE_NOSPACE; 123 break; 124 } 125 } 126 else 127 d = oend; 128 } 129 enc = e->next; 130 s++; 131 if (s == cend) { 132 if (approx && (e->slen & 0x80)) 133 code = ENCODE_FALLBACK; 134 last = s; 135 if (term && (STRLEN)(d-dlast) == tlen && memEQ(dlast, term, tlen)) { 136 code = ENCODE_FOUND_TERM; 137 break; 138 } 139 dlast = d; 140 } 141 } 142 else { 143 /* partial source character */ 144 code = ENCODE_PARTIAL; 145 break; 146 } 147 } 148 else { 149 /* Cannot represent */ 150 code = ENCODE_NOREP; 151 break; 152 } 153 } 154 *slen = last - src; 155 *dout = d - dst; 156 return code; 157 } 158