1package Encode::TECkit;
2
3=head1 NAME
4
5Encode::TECkit - TECkit Encode interface
6
7=head1 DESCRIPTION
8
9This module interfaces with the TECkit processor to provide a Perl interface for
10data conversion.
11
12TECkit is a binary encoding converter designed to handle complex encoding conversions
13requiring multiple passes over the data and contextual data conversion. See the module
14Encode::UTR22 for a module that handles a textual language for this kind of conversion.
15That module contains a compiler that takes an extended UTR22 description and creates
16a binary control file for TECkit. Equally, TECkit contains its own language and compiler,
17but these are not written in Perl.
18
There are two forms of Encode::TECkit object (this is probably a bug). The first is a
pure Perl object, as returned by new(), which passes methods along to the
Encode::TECkit XS code. The second is the binary XS object itself. The difference is
that the Perl object usually contains two binary Encode::TECkit objects. So, don't
go calling XS methods on the pure Perl object (as returned by new).
23
Notice that at this stage the interface is not there to use TECkit as a pure Unicode
normalizer or encoding form converter. Use C<Unicode::Normalize> and (un)pack for that.
26
27=head1 METHODS
28
29=cut
30
# Load the XS half of the module through DynaLoader. The order is deliberate:
# @ISA has to name DynaLoader before bootstrap is invoked as a class method.
require DynaLoader;
@ISA = ('DynaLoader');

$VERSION = 0.06;

Encode::TECkit->bootstrap;

# Value added to a converter's style for each normal form that can be
# requested through the -form option.
my %forms = (
    'nfc' => 0x100,
    'nfd' => 0x200,
);
39
40=head2 Encode::TECkit->new($fname, %opts)
41
42This creates a new TECkit object. The usual form of the method call is to pass in
43the filename of the TECkit binary control file to use. In addition, the option:
44C<-form> may be used to specify which normal form to create when converting to
45UTF-8. This can take the values: C<nfc> or C<nfd>.
46
47It is possible to get an XS Encode::TECkit object using new(). To get this, use
48the following required options:
49
50=over 4
51
52=item -raw
53
54Set this to a non-zero value to get a pure XS object
55
56=item -forward
57
If set, then mapping of this object is in the direction of forwards as specified
in the TECkit binary file. By default this is assumed to be bytes to Unicode. If
cleared, then the direction is the opposite (Unicode to bytes).
61
62=item -style
63
64This specifies what form the data should be converted to. The only sensible values
65are: 1 for bytes, 2 for UTF-8 and 3 for Unicode to Unicode translation.
66
67=back
68
69There are other non-required options to new:
70
71=over 4
72
73=item -form
74
75Takes the value C<nfc> or C<nfd> according to which form the data to be converted
76is in or should be in.
77
78=back
79
80=cut
81
sub new
{
    # Constructor. With -raw set, returns a single XS converter built from
    # $fname using the caller's -forward and -style values. Otherwise returns
    # a hash-based object holding a 'decoder' (forward, UTF-8 output) and an
    # 'encoder' (reverse, byte output) XS converter for the same file.
    # Returns undef if new_conv reports a non-zero result code.
    my ($class, $fname, %opts) = @_;
    my ($res) = {};
    my ($ref, $hr);

    # Translate the optional -form name into its style flag. A missing or
    # unrecognised name maps to 0 (no normalisation flag) so that $form is
    # always a defined number, both for the arithmetic below and for the
    # value stored in the object.
    my $form = 0;
    if ($opts{'-form'})
    { $form = $forms{lc($opts{'-form'})} || 0; }

    if ($opts{'-raw'})
    {
        # -style is documented as required here; fall back to 0 rather than
        # doing arithmetic on undef when it is missing.
        my $style = ($opts{'-style'} || 0) + $form;

        ($res, $hr) = new_conv($fname, $opts{'-forward'}, $style);
        # 'return undef' (not a bare return) is kept on purpose so existing
        # callers that use the result in list context see no change.
        return undef if $hr;
    }
    else
    {
        # Forward direction, style 2 (UTF-8) plus any normal-form flag.
        ($ref, $hr) = new_conv($fname, 1, 2 + $form);
        return undef if $hr;
        $res->{'decoder'} = $ref;

        # Reverse direction, style 1 (bytes); no normal-form flag is applied.
        ($ref, $hr) = new_conv($fname, 0, 1);
        return undef if $hr;
        $res->{'encoder'} = $ref;

        $res->{'form'} = $form;
    }

    # Accept either a class name or an existing object as the invocant.
    return bless $res, ref($class) || $class;
}
110
111
=head2 Encode::TECkit->new_scalar($fdat, %opts)

Works like new(), except that $fdat is handed to the XS function
new_conv_scalar() where new() hands a filename to new_conv(). Presumably
$fdat holds the contents of a TECkit binary control file that has already
been read into memory. The options C<-form>, C<-raw>, C<-forward> and
C<-style> are handled in the same way as for new().

Returns undef if the XS function reports a non-zero result code.

=cut

sub new_scalar
{
    my ($class, $fdat, %opts) = @_;
    my ($res) = {};
    my ($ref, $hr);

    # Translate the optional -form name into its style flag. Start from 0 so
    # that $form is a defined number when -form is absent or unrecognised;
    # previously it was left undef here, unlike in new().
    my $form = 0;
    if ($opts{'-form'})
    { $form = $forms{lc($opts{'-form'})} || 0; }

    if ($opts{'-raw'})
    {
        # Fall back to 0 rather than doing arithmetic on undef when -style
        # is missing.
        my $style = ($opts{'-style'} || 0) + $form;

        ($res, $hr) = new_conv_scalar($fdat, $opts{'-forward'}, $style);
        # 'return undef' (not a bare return) is kept on purpose so existing
        # callers that use the result in list context see no change.
        return undef if $hr;
    }
    else
    {
        # Forward direction, style 2 (UTF-8) plus any normal-form flag.
        ($ref, $hr) = new_conv_scalar($fdat, 1, 2 + $form);
        return undef if $hr;
        $res->{'decoder'} = $ref;

        # Reverse direction, style 1 (bytes); no normal-form flag is applied.
        ($ref, $hr) = new_conv_scalar($fdat, 0, 1);
        return undef if $hr;
        $res->{'encoder'} = $ref;

        $res->{'form'} = $form;
    }

    # Accept either a class name or an existing object as the invocant.
    return bless $res, ref($class) || $class;
}
137
138
139=head2 $enc->decode($str, $check)
140
141Converts $str from bytes to Unicode. $check does nothing in this implementation.
142
143=cut
144
sub decode
{
    # Runs $str through the object's 'decoder' converter with style 2
    # (UTF-8) and returns the converted string.
    my $self = shift;
    my ($str, $check) = @_;    # $check is accepted but never consulted

    # convert() needs a real variable as its third argument; 1 flags the
    # input as a complete string.
    my $complete = 1;
    my $converted = $self->{'decoder'}->convert($str, 2, $complete);

    return $converted;
}
154
155
156=head2 $enc->encode($str, $check)
157
158Converts $str from Unicode to bytes. $check does nothing in this implementation
159and has no meaning (ignore it).
160
161=cut
162
sub encode
{
    # Runs $str through the object's 'encoder' converter with style 1
    # (bytes) and returns the converted string.
    my $self = shift;
    my ($str, $check) = @_;    # $check is accepted but never consulted

    # convert() needs a real variable as its third argument; 1 flags the
    # input as a complete string.
    my $complete = 1;
    my $converted = $self->{'encoder'}->convert($str, 1, $complete);

    return $converted;
}
172
173
174=head2 ($xs_enc, $hr) = Encode::TECkit::new_conv($fname, $forward, $style)
175
176XS function to create a new Encode::TECkit object. $fname specifies the filename of
177the TECkit binary control file to use. $forward indicates which direction to use
178the control file. $style is the encoding form of the output when using this mapping.
179The only sensible values are: 1 - bytes, 2 - UTF-8, and 0x102 for UTF-8 NFC
180and 0x202 for UTF-8 NFD.
181
$hr is a result code which is 0 for success and non-zero for failure. See
TECkit_Engine.h in the source for details of the meaning of this value.
184
185
186=head2 $res = $xs_enc->convert($str, $style, $isComplete)
187
XS function that converts a string according to the way the converter was set up. $str is the
string to convert. $style indicates the resulting encoding format: 1 - bytes,
2 - UTF-8. $style is used to set the appropriate bits in the string to
indicate the encoding to Perl. $isComplete indicates whether the string is
a complete string and so no further flushing is needed. It also acts as an
output parameter (and so must be a valid lvalue): the value returned through it
is the $hr for the conversion.
195
196
197=head2 $res = $xs_enc->flush($style, $hr)
198
XS function that finishes off a conversion with the given $style value. Notice that $hr is
merely a placeholder for the returned $hr, so must be a valid lvalue. Its
value on input has no meaning.
202
203=cut
204