1# 2# $Id: GSM0338.pm,v 2.10 2021/05/24 10:56:53 dankogai Exp $ 3# 4package Encode::GSM0338; 5 6use strict; 7use warnings; 8use Carp; 9 10use vars qw($VERSION); 11$VERSION = do { my @r = ( q$Revision: 2.10 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; 12 13use Encode qw(:fallbacks); 14 15use parent qw(Encode::Encoding); 16__PACKAGE__->Define('gsm0338'); 17 18use utf8; 19 20# Mapping table according to 3GPP TS 23.038 version 16.0.0 Release 16 and ETSI TS 123 038 V16.0.0 (2020-07) 21# https://www.etsi.org/deliver/etsi_ts/123000_123099/123038/16.00.00_60/ts_123038v160000p.pdf (page 20 and 22) 22our %UNI2GSM = ( 23 "\x{000A}" => "\x0A", # LINE FEED 24 "\x{000C}" => "\x1B\x0A", # FORM FEED 25 "\x{000D}" => "\x0D", # CARRIAGE RETURN 26 "\x{0020}" => "\x20", # SPACE 27 "\x{0021}" => "\x21", # EXCLAMATION MARK 28 "\x{0022}" => "\x22", # QUOTATION MARK 29 "\x{0023}" => "\x23", # NUMBER SIGN 30 "\x{0024}" => "\x02", # DOLLAR SIGN 31 "\x{0025}" => "\x25", # PERCENT SIGN 32 "\x{0026}" => "\x26", # AMPERSAND 33 "\x{0027}" => "\x27", # APOSTROPHE 34 "\x{0028}" => "\x28", # LEFT PARENTHESIS 35 "\x{0029}" => "\x29", # RIGHT PARENTHESIS 36 "\x{002A}" => "\x2A", # ASTERISK 37 "\x{002B}" => "\x2B", # PLUS SIGN 38 "\x{002C}" => "\x2C", # COMMA 39 "\x{002D}" => "\x2D", # HYPHEN-MINUS 40 "\x{002E}" => "\x2E", # FULL STOP 41 "\x{002F}" => "\x2F", # SOLIDUS 42 "\x{0030}" => "\x30", # DIGIT ZERO 43 "\x{0031}" => "\x31", # DIGIT ONE 44 "\x{0032}" => "\x32", # DIGIT TWO 45 "\x{0033}" => "\x33", # DIGIT THREE 46 "\x{0034}" => "\x34", # DIGIT FOUR 47 "\x{0035}" => "\x35", # DIGIT FIVE 48 "\x{0036}" => "\x36", # DIGIT SIX 49 "\x{0037}" => "\x37", # DIGIT SEVEN 50 "\x{0038}" => "\x38", # DIGIT EIGHT 51 "\x{0039}" => "\x39", # DIGIT NINE 52 "\x{003A}" => "\x3A", # COLON 53 "\x{003B}" => "\x3B", # SEMICOLON 54 "\x{003C}" => "\x3C", # LESS-THAN SIGN 55 "\x{003D}" => "\x3D", # EQUALS SIGN 56 "\x{003E}" => "\x3E", # GREATER-THAN SIGN 57 "\x{003F}" => "\x3F", # QUESTION MARK 58 "\x{0040}" => "\x00", # COMMERCIAL AT 59 "\x{0041}" => "\x41", # LATIN CAPITAL LETTER A 60 "\x{0042}" => "\x42", # LATIN CAPITAL LETTER B 61 "\x{0043}" => "\x43", # LATIN CAPITAL LETTER C 62 "\x{0044}" => "\x44", # LATIN CAPITAL LETTER D 63 "\x{0045}" => "\x45", # LATIN CAPITAL LETTER E 64 "\x{0046}" => "\x46", # LATIN CAPITAL LETTER F 65 "\x{0047}" => "\x47", # LATIN CAPITAL LETTER G 66 "\x{0048}" => "\x48", # LATIN CAPITAL LETTER H 67 "\x{0049}" => "\x49", # LATIN CAPITAL LETTER I 68 "\x{004A}" => "\x4A", # LATIN CAPITAL LETTER J 69 "\x{004B}" => "\x4B", # LATIN CAPITAL LETTER K 70 "\x{004C}" => "\x4C", # LATIN CAPITAL LETTER L 71 "\x{004D}" => "\x4D", # LATIN CAPITAL LETTER M 72 "\x{004E}" => "\x4E", # LATIN CAPITAL LETTER N 73 "\x{004F}" => "\x4F", # LATIN CAPITAL LETTER O 74 "\x{0050}" => "\x50", # LATIN CAPITAL LETTER P 75 "\x{0051}" => "\x51", # LATIN CAPITAL LETTER Q 76 "\x{0052}" => "\x52", # LATIN CAPITAL LETTER R 77 "\x{0053}" => "\x53", # LATIN CAPITAL LETTER S 78 "\x{0054}" => "\x54", # LATIN CAPITAL LETTER T 79 "\x{0055}" => "\x55", # LATIN CAPITAL LETTER U 80 "\x{0056}" => "\x56", # LATIN CAPITAL LETTER V 81 "\x{0057}" => "\x57", # LATIN CAPITAL LETTER W 82 "\x{0058}" => "\x58", # LATIN CAPITAL LETTER X 83 "\x{0059}" => "\x59", # LATIN CAPITAL LETTER Y 84 "\x{005A}" => "\x5A", # LATIN CAPITAL LETTER Z 85 "\x{005B}" => "\x1B\x3C", # LEFT SQUARE BRACKET 86 "\x{005C}" => "\x1B\x2F", # REVERSE SOLIDUS 87 "\x{005D}" => "\x1B\x3E", # RIGHT SQUARE BRACKET 88 "\x{005E}" => "\x1B\x14", # CIRCUMFLEX ACCENT 89 "\x{005F}" => "\x11", # LOW LINE 90 "\x{0061}" => "\x61", # LATIN SMALL LETTER A 91 "\x{0062}" => "\x62", # LATIN SMALL LETTER B 92 "\x{0063}" => "\x63", # LATIN SMALL LETTER C 93 "\x{0064}" => "\x64", # LATIN SMALL LETTER D 94 "\x{0065}" => "\x65", # LATIN SMALL LETTER E 95 "\x{0066}" => "\x66", # LATIN SMALL LETTER F 96 "\x{0067}" => "\x67", # LATIN SMALL LETTER G 97 "\x{0068}" => "\x68", # LATIN SMALL LETTER H 98 "\x{0069}" => "\x69", # LATIN SMALL LETTER I 99 "\x{006A}" => "\x6A", # LATIN SMALL LETTER J 100 "\x{006B}" => "\x6B", # LATIN SMALL LETTER K 101 "\x{006C}" => "\x6C", # LATIN SMALL LETTER L 102 "\x{006D}" => "\x6D", # LATIN SMALL LETTER M 103 "\x{006E}" => "\x6E", # LATIN SMALL LETTER N 104 "\x{006F}" => "\x6F", # LATIN SMALL LETTER O 105 "\x{0070}" => "\x70", # LATIN SMALL LETTER P 106 "\x{0071}" => "\x71", # LATIN SMALL LETTER Q 107 "\x{0072}" => "\x72", # LATIN SMALL LETTER R 108 "\x{0073}" => "\x73", # LATIN SMALL LETTER S 109 "\x{0074}" => "\x74", # LATIN SMALL LETTER T 110 "\x{0075}" => "\x75", # LATIN SMALL LETTER U 111 "\x{0076}" => "\x76", # LATIN SMALL LETTER V 112 "\x{0077}" => "\x77", # LATIN SMALL LETTER W 113 "\x{0078}" => "\x78", # LATIN SMALL LETTER X 114 "\x{0079}" => "\x79", # LATIN SMALL LETTER Y 115 "\x{007A}" => "\x7A", # LATIN SMALL LETTER Z 116 "\x{007B}" => "\x1B\x28", # LEFT CURLY BRACKET 117 "\x{007C}" => "\x1B\x40", # VERTICAL LINE 118 "\x{007D}" => "\x1B\x29", # RIGHT CURLY BRACKET 119 "\x{007E}" => "\x1B\x3D", # TILDE 120 "\x{00A1}" => "\x40", # INVERTED EXCLAMATION MARK 121 "\x{00A3}" => "\x01", # POUND SIGN 122 "\x{00A4}" => "\x24", # CURRENCY SIGN 123 "\x{00A5}" => "\x03", # YEN SIGN 124 "\x{00A7}" => "\x5F", # SECTION SIGN 125 "\x{00BF}" => "\x60", # INVERTED QUESTION MARK 126 "\x{00C4}" => "\x5B", # LATIN CAPITAL LETTER A WITH DIAERESIS 127 "\x{00C5}" => "\x0E", # LATIN CAPITAL LETTER A WITH RING ABOVE 128 "\x{00C6}" => "\x1C", # LATIN CAPITAL LETTER AE 129 "\x{00C7}" => "\x09", # LATIN CAPITAL LETTER C WITH CEDILLA 130 "\x{00C9}" => "\x1F", # LATIN CAPITAL LETTER E WITH ACUTE 131 "\x{00D1}" => "\x5D", # LATIN CAPITAL LETTER N WITH TILDE 132 "\x{00D6}" => "\x5C", # LATIN CAPITAL LETTER O WITH DIAERESIS 133 "\x{00D8}" => "\x0B", # LATIN CAPITAL LETTER O WITH STROKE 134 "\x{00DC}" => "\x5E", # LATIN CAPITAL LETTER U WITH DIAERESIS 135 "\x{00DF}" => "\x1E", # LATIN SMALL LETTER SHARP S 136 "\x{00E0}" => "\x7F", # LATIN SMALL LETTER A WITH GRAVE 137 "\x{00E4}" => "\x7B", # LATIN SMALL LETTER A WITH DIAERESIS 138 "\x{00E5}" => "\x0F", # LATIN SMALL LETTER A WITH RING ABOVE 139 "\x{00E6}" => "\x1D", # LATIN SMALL LETTER AE 140 "\x{00E8}" => "\x04", # LATIN SMALL LETTER E WITH GRAVE 141 "\x{00E9}" => "\x05", # LATIN SMALL LETTER E WITH ACUTE 142 "\x{00EC}" => "\x07", # LATIN SMALL LETTER I WITH GRAVE 143 "\x{00F1}" => "\x7D", # LATIN SMALL LETTER N WITH TILDE 144 "\x{00F2}" => "\x08", # LATIN SMALL LETTER O WITH GRAVE 145 "\x{00F6}" => "\x7C", # LATIN SMALL LETTER O WITH DIAERESIS 146 "\x{00F8}" => "\x0C", # LATIN SMALL LETTER O WITH STROKE 147 "\x{00F9}" => "\x06", # LATIN SMALL LETTER U WITH GRAVE 148 "\x{00FC}" => "\x7E", # LATIN SMALL LETTER U WITH DIAERESIS 149 "\x{0393}" => "\x13", # GREEK CAPITAL LETTER GAMMA 150 "\x{0394}" => "\x10", # GREEK CAPITAL LETTER DELTA 151 "\x{0398}" => "\x19", # GREEK CAPITAL LETTER THETA 152 "\x{039B}" => "\x14", # GREEK CAPITAL LETTER LAMDA 153 "\x{039E}" => "\x1A", # GREEK CAPITAL LETTER XI 154 "\x{03A0}" => "\x16", # GREEK CAPITAL LETTER PI 155 "\x{03A3}" => "\x18", # GREEK CAPITAL LETTER SIGMA 156 "\x{03A6}" => "\x12", # GREEK CAPITAL LETTER PHI 157 "\x{03A8}" => "\x17", # GREEK CAPITAL LETTER PSI 158 "\x{03A9}" => "\x15", # GREEK CAPITAL LETTER OMEGA 159 "\x{20AC}" => "\x1B\x65", # EURO SIGN 160); 161our %GSM2UNI = reverse %UNI2GSM; 162our $ESC = "\x1b"; 163 164sub decode ($$;$) { 165 my ( $obj, $bytes, $chk ) = @_; 166 return undef unless defined $bytes; 167 my $str = substr( $bytes, 0, 0 ); # to propagate taintedness; 168 while ( length $bytes ) { 169 my $seq = ''; 170 my $c; 171 do { 172 $c = substr( $bytes, 0, 1, '' ); 173 $seq .= $c; 174 } while ( length $bytes and $c eq $ESC ); 175 my $u = 176 exists $GSM2UNI{$seq} ? $GSM2UNI{$seq} 177 : ( $chk && ref $chk eq 'CODE' ) ? $chk->( unpack 'C*', $seq ) 178 : "\x{FFFD}"; 179 if ( not exists $GSM2UNI{$seq} and $chk and not ref $chk ) { 180 if ( substr( $seq, 0, 1 ) eq $ESC 181 and ( $chk & Encode::STOP_AT_PARTIAL ) ) 182 { 183 $bytes .= $seq; 184 last; 185 } 186 croak join( '', map { sprintf "\\x%02X", $_ } unpack 'C*', $seq ) 187 . ' does not map to Unicode' 188 if $chk & Encode::DIE_ON_ERR; 189 carp join( '', map { sprintf "\\x%02X", $_ } unpack 'C*', $seq ) 190 . ' does not map to Unicode' 191 if $chk & Encode::WARN_ON_ERR; 192 if ( $chk & Encode::RETURN_ON_ERR ) { 193 $bytes .= $seq; 194 last; 195 } 196 } 197 $str .= $u; 198 } 199 $_[1] = $bytes if not ref $chk and $chk and !( $chk & Encode::LEAVE_SRC ); 200 return $str; 201} 202 203sub encode($$;$) { 204 my ( $obj, $str, $chk ) = @_; 205 return undef unless defined $str; 206 my $bytes = substr( $str, 0, 0 ); # to propagate taintedness 207 while ( length $str ) { 208 my $u = substr( $str, 0, 1, '' ); 209 my $c; 210 my $seq = 211 exists $UNI2GSM{$u} ? $UNI2GSM{$u} 212 : ( $chk && ref $chk eq 'CODE' ) ? $chk->( ord($u) ) 213 : $UNI2GSM{'?'}; 214 if ( not exists $UNI2GSM{$u} and $chk and not ref $chk ) { 215 croak sprintf( "\\x{%04x} does not map to %s", ord($u), $obj->name ) 216 if $chk & Encode::DIE_ON_ERR; 217 carp sprintf( "\\x{%04x} does not map to %s", ord($u), $obj->name ) 218 if $chk & Encode::WARN_ON_ERR; 219 if ( $chk & Encode::RETURN_ON_ERR ) { 220 $str .= $u; 221 last; 222 } 223 } 224 $bytes .= $seq; 225 } 226 $_[1] = $str if not ref $chk and $chk and !( $chk & Encode::LEAVE_SRC ); 227 return $bytes; 228} 229 2301; 231__END__ 232 233=head1 NAME 234 235Encode::GSM0338 -- ETSI GSM 03.38 Encoding 236 237=head1 SYNOPSIS 238 239 use Encode qw/encode decode/; 240 $gsm0338 = encode("gsm0338", $unicode); # loads Encode::GSM0338 implicitly 241 $unicode = decode("gsm0338", $gsm0338); # ditto 242 243=head1 DESCRIPTION 244 245GSM0338 is for GSM handsets. Though it shares alphanumerals with ASCII, 246control character ranges and other parts are mapped very differently, 247mainly to store Greek characters. There are also escape sequences 248(starting with 0x1B) to cover e.g. the Euro sign. 249 250This was once handled by L<Encode::Bytes> but because of all those 251unusual specifications, Encode 2.20 has relocated the support to 252this module. 253 254This module implements only I<GSM 7 bit Default Alphabet> and 255I<GSM 7 bit default alphabet extension table> according to standard 2563GPP TS 23.038 version 16. Therefore I<National Language Single Shift> 257and I<National Language Locking Shift> are not implemented nor supported. 258 259=head2 Septets 260 261This modules operates with octets (like any other Encode module) and not 262with packed septets (unlike other GSM standards). Therefore for processing 263binary SMS or parts of GSM TPDU payload (3GPP TS 23.040) it is needed to do 264conversion between octets and packed septets. For this purpose perl's C<pack> 265and C<unpack> functions may be useful: 266 267 $bytes = substr(pack('(b*)*', unpack '(A7)*', unpack 'b*', $septets), 0, $num_of_septets); 268 $unicode = decode('GSM0338', $bytes); 269 270 $bytes = encode('GSM0338', $unicode); 271 $septets = pack 'b*', join '', map { substr $_, 0, 7 } unpack '(A8)*', unpack 'b*', $bytes; 272 $num_of_septets = length $bytes; 273 274Please note that for correct decoding of packed septets it is required to 275know number of septets packed in binary buffer as binary buffer is always 276padded with zero bits and 7 zero bits represents character C<@>. Number 277of septets is also stored in TPDU payload when dealing with 3GPP TS 23.040. 278 279=head1 BUGS 280 281Encode::GSM0338 2.7 and older versions (part of Encode 3.06) incorrectly 282handled zero bytes (character C<@>). This was fixed in Encode::GSM0338 283version 2.8 (part of Encode 3.07). 284 285=head1 SEE ALSO 286 287L<3GPP TS 23.038|https://www.3gpp.org/dynareport/23038.htm> 288 289L<ETSI TS 123 038 V16.0.0 (2020-07)|https://www.etsi.org/deliver/etsi_ts/123000_123099/123038/16.00.00_60/ts_123038v160000p.pdf> 290 291L<Encode> 292 293=cut 294