1#
2# $Id: GSM0338.pm,v 2.10 2021/05/24 10:56:53 dankogai Exp $
3#
4package Encode::GSM0338;
5
6use strict;
7use warnings;
8use Carp;
9
10use vars qw($VERSION);
11$VERSION = do { my @r = ( q$Revision: 2.10 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
12
13use Encode qw(:fallbacks);
14
15use parent qw(Encode::Encoding);
16__PACKAGE__->Define('gsm0338');
17
18use utf8;
19
20# Mapping table according to 3GPP TS 23.038 version 16.0.0 Release 16 and ETSI TS 123 038 V16.0.0 (2020-07)
21# https://www.etsi.org/deliver/etsi_ts/123000_123099/123038/16.00.00_60/ts_123038v160000p.pdf (page 20 and 22)
22our %UNI2GSM = (
23    "\x{000A}" => "\x0A",        # LINE FEED
24    "\x{000C}" => "\x1B\x0A",    # FORM FEED
25    "\x{000D}" => "\x0D",        # CARRIAGE RETURN
26    "\x{0020}" => "\x20",        # SPACE
27    "\x{0021}" => "\x21",        # EXCLAMATION MARK
28    "\x{0022}" => "\x22",        # QUOTATION MARK
29    "\x{0023}" => "\x23",        # NUMBER SIGN
30    "\x{0024}" => "\x02",        # DOLLAR SIGN
31    "\x{0025}" => "\x25",        # PERCENT SIGN
32    "\x{0026}" => "\x26",        # AMPERSAND
33    "\x{0027}" => "\x27",        # APOSTROPHE
34    "\x{0028}" => "\x28",        # LEFT PARENTHESIS
35    "\x{0029}" => "\x29",        # RIGHT PARENTHESIS
36    "\x{002A}" => "\x2A",        # ASTERISK
37    "\x{002B}" => "\x2B",        # PLUS SIGN
38    "\x{002C}" => "\x2C",        # COMMA
39    "\x{002D}" => "\x2D",        # HYPHEN-MINUS
40    "\x{002E}" => "\x2E",        # FULL STOP
41    "\x{002F}" => "\x2F",        # SOLIDUS
42    "\x{0030}" => "\x30",        # DIGIT ZERO
43    "\x{0031}" => "\x31",        # DIGIT ONE
44    "\x{0032}" => "\x32",        # DIGIT TWO
45    "\x{0033}" => "\x33",        # DIGIT THREE
46    "\x{0034}" => "\x34",        # DIGIT FOUR
47    "\x{0035}" => "\x35",        # DIGIT FIVE
48    "\x{0036}" => "\x36",        # DIGIT SIX
49    "\x{0037}" => "\x37",        # DIGIT SEVEN
50    "\x{0038}" => "\x38",        # DIGIT EIGHT
51    "\x{0039}" => "\x39",        # DIGIT NINE
52    "\x{003A}" => "\x3A",        # COLON
53    "\x{003B}" => "\x3B",        # SEMICOLON
54    "\x{003C}" => "\x3C",        # LESS-THAN SIGN
55    "\x{003D}" => "\x3D",        # EQUALS SIGN
56    "\x{003E}" => "\x3E",        # GREATER-THAN SIGN
57    "\x{003F}" => "\x3F",        # QUESTION MARK
58    "\x{0040}" => "\x00",        # COMMERCIAL AT
59    "\x{0041}" => "\x41",        # LATIN CAPITAL LETTER A
60    "\x{0042}" => "\x42",        # LATIN CAPITAL LETTER B
61    "\x{0043}" => "\x43",        # LATIN CAPITAL LETTER C
62    "\x{0044}" => "\x44",        # LATIN CAPITAL LETTER D
63    "\x{0045}" => "\x45",        # LATIN CAPITAL LETTER E
64    "\x{0046}" => "\x46",        # LATIN CAPITAL LETTER F
65    "\x{0047}" => "\x47",        # LATIN CAPITAL LETTER G
66    "\x{0048}" => "\x48",        # LATIN CAPITAL LETTER H
67    "\x{0049}" => "\x49",        # LATIN CAPITAL LETTER I
68    "\x{004A}" => "\x4A",        # LATIN CAPITAL LETTER J
69    "\x{004B}" => "\x4B",        # LATIN CAPITAL LETTER K
70    "\x{004C}" => "\x4C",        # LATIN CAPITAL LETTER L
71    "\x{004D}" => "\x4D",        # LATIN CAPITAL LETTER M
72    "\x{004E}" => "\x4E",        # LATIN CAPITAL LETTER N
73    "\x{004F}" => "\x4F",        # LATIN CAPITAL LETTER O
74    "\x{0050}" => "\x50",        # LATIN CAPITAL LETTER P
75    "\x{0051}" => "\x51",        # LATIN CAPITAL LETTER Q
76    "\x{0052}" => "\x52",        # LATIN CAPITAL LETTER R
77    "\x{0053}" => "\x53",        # LATIN CAPITAL LETTER S
78    "\x{0054}" => "\x54",        # LATIN CAPITAL LETTER T
79    "\x{0055}" => "\x55",        # LATIN CAPITAL LETTER U
80    "\x{0056}" => "\x56",        # LATIN CAPITAL LETTER V
81    "\x{0057}" => "\x57",        # LATIN CAPITAL LETTER W
82    "\x{0058}" => "\x58",        # LATIN CAPITAL LETTER X
83    "\x{0059}" => "\x59",        # LATIN CAPITAL LETTER Y
84    "\x{005A}" => "\x5A",        # LATIN CAPITAL LETTER Z
85    "\x{005B}" => "\x1B\x3C",    # LEFT SQUARE BRACKET
86    "\x{005C}" => "\x1B\x2F",    # REVERSE SOLIDUS
87    "\x{005D}" => "\x1B\x3E",    # RIGHT SQUARE BRACKET
88    "\x{005E}" => "\x1B\x14",    # CIRCUMFLEX ACCENT
89    "\x{005F}" => "\x11",        # LOW LINE
90    "\x{0061}" => "\x61",        # LATIN SMALL LETTER A
91    "\x{0062}" => "\x62",        # LATIN SMALL LETTER B
92    "\x{0063}" => "\x63",        # LATIN SMALL LETTER C
93    "\x{0064}" => "\x64",        # LATIN SMALL LETTER D
94    "\x{0065}" => "\x65",        # LATIN SMALL LETTER E
95    "\x{0066}" => "\x66",        # LATIN SMALL LETTER F
96    "\x{0067}" => "\x67",        # LATIN SMALL LETTER G
97    "\x{0068}" => "\x68",        # LATIN SMALL LETTER H
98    "\x{0069}" => "\x69",        # LATIN SMALL LETTER I
99    "\x{006A}" => "\x6A",        # LATIN SMALL LETTER J
100    "\x{006B}" => "\x6B",        # LATIN SMALL LETTER K
101    "\x{006C}" => "\x6C",        # LATIN SMALL LETTER L
102    "\x{006D}" => "\x6D",        # LATIN SMALL LETTER M
103    "\x{006E}" => "\x6E",        # LATIN SMALL LETTER N
104    "\x{006F}" => "\x6F",        # LATIN SMALL LETTER O
105    "\x{0070}" => "\x70",        # LATIN SMALL LETTER P
106    "\x{0071}" => "\x71",        # LATIN SMALL LETTER Q
107    "\x{0072}" => "\x72",        # LATIN SMALL LETTER R
108    "\x{0073}" => "\x73",        # LATIN SMALL LETTER S
109    "\x{0074}" => "\x74",        # LATIN SMALL LETTER T
110    "\x{0075}" => "\x75",        # LATIN SMALL LETTER U
111    "\x{0076}" => "\x76",        # LATIN SMALL LETTER V
112    "\x{0077}" => "\x77",        # LATIN SMALL LETTER W
113    "\x{0078}" => "\x78",        # LATIN SMALL LETTER X
114    "\x{0079}" => "\x79",        # LATIN SMALL LETTER Y
115    "\x{007A}" => "\x7A",        # LATIN SMALL LETTER Z
116    "\x{007B}" => "\x1B\x28",    # LEFT CURLY BRACKET
117    "\x{007C}" => "\x1B\x40",    # VERTICAL LINE
118    "\x{007D}" => "\x1B\x29",    # RIGHT CURLY BRACKET
119    "\x{007E}" => "\x1B\x3D",    # TILDE
120    "\x{00A1}" => "\x40",        # INVERTED EXCLAMATION MARK
121    "\x{00A3}" => "\x01",        # POUND SIGN
122    "\x{00A4}" => "\x24",        # CURRENCY SIGN
123    "\x{00A5}" => "\x03",        # YEN SIGN
124    "\x{00A7}" => "\x5F",        # SECTION SIGN
125    "\x{00BF}" => "\x60",        # INVERTED QUESTION MARK
126    "\x{00C4}" => "\x5B",        # LATIN CAPITAL LETTER A WITH DIAERESIS
127    "\x{00C5}" => "\x0E",        # LATIN CAPITAL LETTER A WITH RING ABOVE
128    "\x{00C6}" => "\x1C",        # LATIN CAPITAL LETTER AE
129    "\x{00C7}" => "\x09",        # LATIN CAPITAL LETTER C WITH CEDILLA
130    "\x{00C9}" => "\x1F",        # LATIN CAPITAL LETTER E WITH ACUTE
131    "\x{00D1}" => "\x5D",        # LATIN CAPITAL LETTER N WITH TILDE
132    "\x{00D6}" => "\x5C",        # LATIN CAPITAL LETTER O WITH DIAERESIS
133    "\x{00D8}" => "\x0B",        # LATIN CAPITAL LETTER O WITH STROKE
134    "\x{00DC}" => "\x5E",        # LATIN CAPITAL LETTER U WITH DIAERESIS
135    "\x{00DF}" => "\x1E",        # LATIN SMALL LETTER SHARP S
136    "\x{00E0}" => "\x7F",        # LATIN SMALL LETTER A WITH GRAVE
137    "\x{00E4}" => "\x7B",        # LATIN SMALL LETTER A WITH DIAERESIS
138    "\x{00E5}" => "\x0F",        # LATIN SMALL LETTER A WITH RING ABOVE
139    "\x{00E6}" => "\x1D",        # LATIN SMALL LETTER AE
140    "\x{00E8}" => "\x04",        # LATIN SMALL LETTER E WITH GRAVE
141    "\x{00E9}" => "\x05",        # LATIN SMALL LETTER E WITH ACUTE
142    "\x{00EC}" => "\x07",        # LATIN SMALL LETTER I WITH GRAVE
143    "\x{00F1}" => "\x7D",        # LATIN SMALL LETTER N WITH TILDE
144    "\x{00F2}" => "\x08",        # LATIN SMALL LETTER O WITH GRAVE
145    "\x{00F6}" => "\x7C",        # LATIN SMALL LETTER O WITH DIAERESIS
146    "\x{00F8}" => "\x0C",        # LATIN SMALL LETTER O WITH STROKE
147    "\x{00F9}" => "\x06",        # LATIN SMALL LETTER U WITH GRAVE
148    "\x{00FC}" => "\x7E",        # LATIN SMALL LETTER U WITH DIAERESIS
149    "\x{0393}" => "\x13",        # GREEK CAPITAL LETTER GAMMA
150    "\x{0394}" => "\x10",        # GREEK CAPITAL LETTER DELTA
151    "\x{0398}" => "\x19",        # GREEK CAPITAL LETTER THETA
152    "\x{039B}" => "\x14",        # GREEK CAPITAL LETTER LAMDA
153    "\x{039E}" => "\x1A",        # GREEK CAPITAL LETTER XI
154    "\x{03A0}" => "\x16",        # GREEK CAPITAL LETTER PI
155    "\x{03A3}" => "\x18",        # GREEK CAPITAL LETTER SIGMA
156    "\x{03A6}" => "\x12",        # GREEK CAPITAL LETTER PHI
157    "\x{03A8}" => "\x17",        # GREEK CAPITAL LETTER PSI
158    "\x{03A9}" => "\x15",        # GREEK CAPITAL LETTER OMEGA
159    "\x{20AC}" => "\x1B\x65",    # EURO SIGN
160);
161our %GSM2UNI = reverse %UNI2GSM;
162our $ESC     = "\x1b";
163
164sub decode ($$;$) {
165    my ( $obj, $bytes, $chk ) = @_;
166    return undef unless defined $bytes;
167    my $str = substr( $bytes, 0, 0 );    # to propagate taintedness;
168    while ( length $bytes ) {
169        my $seq = '';
170        my $c;
171        do {
172            $c = substr( $bytes, 0, 1, '' );
173            $seq .= $c;
174        } while ( length $bytes and $c eq $ESC );
175        my $u =
176            exists $GSM2UNI{$seq}          ? $GSM2UNI{$seq}
177          : ( $chk && ref $chk eq 'CODE' ) ? $chk->( unpack 'C*', $seq )
178          :                                  "\x{FFFD}";
179        if ( not exists $GSM2UNI{$seq} and $chk and not ref $chk ) {
180            if ( substr( $seq, 0, 1 ) eq $ESC
181                and ( $chk & Encode::STOP_AT_PARTIAL ) )
182            {
183                $bytes .= $seq;
184                last;
185            }
186            croak join( '', map { sprintf "\\x%02X", $_ } unpack 'C*', $seq )
187              . ' does not map to Unicode'
188              if $chk & Encode::DIE_ON_ERR;
189            carp join( '', map { sprintf "\\x%02X", $_ } unpack 'C*', $seq )
190              . ' does not map to Unicode'
191              if $chk & Encode::WARN_ON_ERR;
192            if ( $chk & Encode::RETURN_ON_ERR ) {
193                $bytes .= $seq;
194                last;
195            }
196        }
197        $str .= $u;
198    }
199    $_[1] = $bytes if not ref $chk and $chk and !( $chk & Encode::LEAVE_SRC );
200    return $str;
201}
202
203sub encode($$;$) {
204    my ( $obj, $str, $chk ) = @_;
205    return undef unless defined $str;
206    my $bytes = substr( $str, 0, 0 );    # to propagate taintedness
207    while ( length $str ) {
208        my $u = substr( $str, 0, 1, '' );
209        my $c;
210        my $seq =
211            exists $UNI2GSM{$u}            ? $UNI2GSM{$u}
212          : ( $chk && ref $chk eq 'CODE' ) ? $chk->( ord($u) )
213          :                                  $UNI2GSM{'?'};
214        if ( not exists $UNI2GSM{$u} and $chk and not ref $chk ) {
215            croak sprintf( "\\x{%04x} does not map to %s", ord($u), $obj->name )
216              if $chk & Encode::DIE_ON_ERR;
217            carp sprintf( "\\x{%04x} does not map to %s", ord($u), $obj->name )
218              if $chk & Encode::WARN_ON_ERR;
219            if ( $chk & Encode::RETURN_ON_ERR ) {
220                $str .= $u;
221                last;
222            }
223        }
224        $bytes .= $seq;
225    }
226    $_[1] = $str if not ref $chk and $chk and !( $chk & Encode::LEAVE_SRC );
227    return $bytes;
228}
229
2301;
231__END__
232
233=head1 NAME
234
235Encode::GSM0338 -- ETSI GSM 03.38 Encoding
236
237=head1 SYNOPSIS
238
239  use Encode qw/encode decode/;
240  $gsm0338 = encode("gsm0338", $unicode); # loads Encode::GSM0338 implicitly
241  $unicode = decode("gsm0338", $gsm0338); # ditto
242
243=head1 DESCRIPTION
244
245GSM0338 is for GSM handsets. Though it shares alphanumerals with ASCII,
246control character ranges and other parts are mapped very differently,
247mainly to store Greek characters.  There are also escape sequences
248(starting with 0x1B) to cover e.g. the Euro sign.
249
250This was once handled by L<Encode::Bytes> but because of all those
251unusual specifications, Encode 2.20 has relocated the support to
252this module.
253
254This module implements only I<GSM 7 bit Default Alphabet> and
255I<GSM 7 bit default alphabet extension table> according to standard
2563GPP TS 23.038 version 16. Therefore I<National Language Single Shift>
257and I<National Language Locking Shift> are not implemented nor supported.
258
259=head2 Septets
260
261This modules operates with octets (like any other Encode module) and not
262with packed septets (unlike other GSM standards). Therefore for processing
263binary SMS or parts of GSM TPDU payload (3GPP TS 23.040) it is needed to do
264conversion between octets and packed septets. For this purpose perl's C<pack>
265and C<unpack> functions may be useful:
266
267  $bytes = substr(pack('(b*)*', unpack '(A7)*', unpack 'b*', $septets), 0, $num_of_septets);
268  $unicode = decode('GSM0338', $bytes);
269
270  $bytes = encode('GSM0338', $unicode);
271  $septets = pack 'b*', join '', map { substr $_, 0, 7 } unpack '(A8)*', unpack 'b*', $bytes;
272  $num_of_septets = length $bytes;
273
274Please note that for correct decoding of packed septets it is required to
275know number of septets packed in binary buffer as binary buffer is always
276padded with zero bits and 7 zero bits represents character C<@>. Number
277of septets is also stored in TPDU payload when dealing with 3GPP TS 23.040.
278
279=head1 BUGS
280
281Encode::GSM0338 2.7 and older versions (part of Encode 3.06) incorrectly
282handled zero bytes (character C<@>). This was fixed in Encode::GSM0338
283version 2.8 (part of Encode 3.07).
284
285=head1 SEE ALSO
286
287L<3GPP TS 23.038|https://www.3gpp.org/dynareport/23038.htm>
288
289L<ETSI TS 123 038 V16.0.0 (2020-07)|https://www.etsi.org/deliver/etsi_ts/123000_123099/123038/16.00.00_60/ts_123038v160000p.pdf>
290
291L<Encode>
292
293=cut
294