1# 2# $Id: GSM0338.pm,v 2.5 2013/09/14 07:51:59 dankogai Exp $ 3# 4package Encode::GSM0338; 5 6use strict; 7use warnings; 8use Carp; 9 10use vars qw($VERSION); 11$VERSION = do { my @r = ( q$Revision: 2.5 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r }; 12 13use Encode qw(:fallbacks); 14 15use parent qw(Encode::Encoding); 16__PACKAGE__->Define('gsm0338'); 17 18sub needs_lines { 1 } 19sub perlio_ok { 0 } 20 21use utf8; 22our %UNI2GSM = ( 23 "\x{0040}" => "\x00", # COMMERCIAL AT 24 "\x{000A}" => "\x0A", # LINE FEED 25 "\x{000C}" => "\x1B\x0A", # FORM FEED 26 "\x{000D}" => "\x0D", # CARRIAGE RETURN 27 "\x{0020}" => "\x20", # SPACE 28 "\x{0021}" => "\x21", # EXCLAMATION MARK 29 "\x{0022}" => "\x22", # QUOTATION MARK 30 "\x{0023}" => "\x23", # NUMBER SIGN 31 "\x{0024}" => "\x02", # DOLLAR SIGN 32 "\x{0025}" => "\x25", # PERCENT SIGN 33 "\x{0026}" => "\x26", # AMPERSAND 34 "\x{0027}" => "\x27", # APOSTROPHE 35 "\x{0028}" => "\x28", # LEFT PARENTHESIS 36 "\x{0029}" => "\x29", # RIGHT PARENTHESIS 37 "\x{002A}" => "\x2A", # ASTERISK 38 "\x{002B}" => "\x2B", # PLUS SIGN 39 "\x{002C}" => "\x2C", # COMMA 40 "\x{002D}" => "\x2D", # HYPHEN-MINUS 41 "\x{002E}" => "\x2E", # FULL STOP 42 "\x{002F}" => "\x2F", # SOLIDUS 43 "\x{0030}" => "\x30", # DIGIT ZERO 44 "\x{0031}" => "\x31", # DIGIT ONE 45 "\x{0032}" => "\x32", # DIGIT TWO 46 "\x{0033}" => "\x33", # DIGIT THREE 47 "\x{0034}" => "\x34", # DIGIT FOUR 48 "\x{0035}" => "\x35", # DIGIT FIVE 49 "\x{0036}" => "\x36", # DIGIT SIX 50 "\x{0037}" => "\x37", # DIGIT SEVEN 51 "\x{0038}" => "\x38", # DIGIT EIGHT 52 "\x{0039}" => "\x39", # DIGIT NINE 53 "\x{003A}" => "\x3A", # COLON 54 "\x{003B}" => "\x3B", # SEMICOLON 55 "\x{003C}" => "\x3C", # LESS-THAN SIGN 56 "\x{003D}" => "\x3D", # EQUALS SIGN 57 "\x{003E}" => "\x3E", # GREATER-THAN SIGN 58 "\x{003F}" => "\x3F", # QUESTION MARK 59 "\x{0041}" => "\x41", # LATIN CAPITAL LETTER A 60 "\x{0042}" => "\x42", # LATIN CAPITAL LETTER B 61 "\x{0043}" => "\x43", # LATIN CAPITAL LETTER C 62 "\x{0044}" => "\x44", # LATIN CAPITAL LETTER D 63 "\x{0045}" => "\x45", # LATIN CAPITAL LETTER E 64 "\x{0046}" => "\x46", # LATIN CAPITAL LETTER F 65 "\x{0047}" => "\x47", # LATIN CAPITAL LETTER G 66 "\x{0048}" => "\x48", # LATIN CAPITAL LETTER H 67 "\x{0049}" => "\x49", # LATIN CAPITAL LETTER I 68 "\x{004A}" => "\x4A", # LATIN CAPITAL LETTER J 69 "\x{004B}" => "\x4B", # LATIN CAPITAL LETTER K 70 "\x{004C}" => "\x4C", # LATIN CAPITAL LETTER L 71 "\x{004D}" => "\x4D", # LATIN CAPITAL LETTER M 72 "\x{004E}" => "\x4E", # LATIN CAPITAL LETTER N 73 "\x{004F}" => "\x4F", # LATIN CAPITAL LETTER O 74 "\x{0050}" => "\x50", # LATIN CAPITAL LETTER P 75 "\x{0051}" => "\x51", # LATIN CAPITAL LETTER Q 76 "\x{0052}" => "\x52", # LATIN CAPITAL LETTER R 77 "\x{0053}" => "\x53", # LATIN CAPITAL LETTER S 78 "\x{0054}" => "\x54", # LATIN CAPITAL LETTER T 79 "\x{0055}" => "\x55", # LATIN CAPITAL LETTER U 80 "\x{0056}" => "\x56", # LATIN CAPITAL LETTER V 81 "\x{0057}" => "\x57", # LATIN CAPITAL LETTER W 82 "\x{0058}" => "\x58", # LATIN CAPITAL LETTER X 83 "\x{0059}" => "\x59", # LATIN CAPITAL LETTER Y 84 "\x{005A}" => "\x5A", # LATIN CAPITAL LETTER Z 85 "\x{005F}" => "\x11", # LOW LINE 86 "\x{0061}" => "\x61", # LATIN SMALL LETTER A 87 "\x{0062}" => "\x62", # LATIN SMALL LETTER B 88 "\x{0063}" => "\x63", # LATIN SMALL LETTER C 89 "\x{0064}" => "\x64", # LATIN SMALL LETTER D 90 "\x{0065}" => "\x65", # LATIN SMALL LETTER E 91 "\x{0066}" => "\x66", # LATIN SMALL LETTER F 92 "\x{0067}" => "\x67", # LATIN SMALL LETTER G 93 "\x{0068}" => "\x68", # LATIN SMALL LETTER H 94 "\x{0069}" => "\x69", # LATIN SMALL LETTER I 95 "\x{006A}" => "\x6A", # LATIN SMALL LETTER J 96 "\x{006B}" => "\x6B", # LATIN SMALL LETTER K 97 "\x{006C}" => "\x6C", # LATIN SMALL LETTER L 98 "\x{006D}" => "\x6D", # LATIN SMALL LETTER M 99 "\x{006E}" => "\x6E", # LATIN SMALL LETTER N 100 "\x{006F}" => "\x6F", # LATIN SMALL LETTER O 101 "\x{0070}" => "\x70", # LATIN SMALL LETTER P 102 "\x{0071}" => "\x71", # LATIN SMALL LETTER Q 103 "\x{0072}" => "\x72", # LATIN SMALL LETTER R 104 "\x{0073}" => "\x73", # LATIN SMALL LETTER S 105 "\x{0074}" => "\x74", # LATIN SMALL LETTER T 106 "\x{0075}" => "\x75", # LATIN SMALL LETTER U 107 "\x{0076}" => "\x76", # LATIN SMALL LETTER V 108 "\x{0077}" => "\x77", # LATIN SMALL LETTER W 109 "\x{0078}" => "\x78", # LATIN SMALL LETTER X 110 "\x{0079}" => "\x79", # LATIN SMALL LETTER Y 111 "\x{007A}" => "\x7A", # LATIN SMALL LETTER Z 112 "\x{000C}" => "\x1B\x0A", # FORM FEED 113 "\x{005B}" => "\x1B\x3C", # LEFT SQUARE BRACKET 114 "\x{005C}" => "\x1B\x2F", # REVERSE SOLIDUS 115 "\x{005D}" => "\x1B\x3E", # RIGHT SQUARE BRACKET 116 "\x{005E}" => "\x1B\x14", # CIRCUMFLEX ACCENT 117 "\x{007B}" => "\x1B\x28", # LEFT CURLY BRACKET 118 "\x{007C}" => "\x1B\x40", # VERTICAL LINE 119 "\x{007D}" => "\x1B\x29", # RIGHT CURLY BRACKET 120 "\x{007E}" => "\x1B\x3D", # TILDE 121 "\x{00A0}" => "\x1B", # NO-BREAK SPACE 122 "\x{00A1}" => "\x40", # INVERTED EXCLAMATION MARK 123 "\x{00A3}" => "\x01", # POUND SIGN 124 "\x{00A4}" => "\x24", # CURRENCY SIGN 125 "\x{00A5}" => "\x03", # YEN SIGN 126 "\x{00A7}" => "\x5F", # SECTION SIGN 127 "\x{00BF}" => "\x60", # INVERTED QUESTION MARK 128 "\x{00C4}" => "\x5B", # LATIN CAPITAL LETTER A WITH DIAERESIS 129 "\x{00C5}" => "\x0E", # LATIN CAPITAL LETTER A WITH RING ABOVE 130 "\x{00C6}" => "\x1C", # LATIN CAPITAL LETTER AE 131 "\x{00C9}" => "\x1F", # LATIN CAPITAL LETTER E WITH ACUTE 132 "\x{00D1}" => "\x5D", # LATIN CAPITAL LETTER N WITH TILDE 133 "\x{00D6}" => "\x5C", # LATIN CAPITAL LETTER O WITH DIAERESIS 134 "\x{00D8}" => "\x0B", # LATIN CAPITAL LETTER O WITH STROKE 135 "\x{00DC}" => "\x5E", # LATIN CAPITAL LETTER U WITH DIAERESIS 136 "\x{00DF}" => "\x1E", # LATIN SMALL LETTER SHARP S 137 "\x{00E0}" => "\x7F", # LATIN SMALL LETTER A WITH GRAVE 138 "\x{00E4}" => "\x7B", # LATIN SMALL LETTER A WITH DIAERESIS 139 "\x{00E5}" => "\x0F", # LATIN SMALL LETTER A WITH RING ABOVE 140 "\x{00E6}" => "\x1D", # LATIN SMALL LETTER AE 141 #"\x{00E7}" => "\x09", # LATIN SMALL LETTER C WITH CEDILLA 142 "\x{00C7}" => "\x09", # LATIN CAPITAL LETTER C WITH CEDILLA 143 "\x{00E8}" => "\x04", # LATIN SMALL LETTER E WITH GRAVE 144 "\x{00E9}" => "\x05", # LATIN SMALL LETTER E WITH ACUTE 145 "\x{00EC}" => "\x07", # LATIN SMALL LETTER I WITH GRAVE 146 "\x{00F1}" => "\x7D", # LATIN SMALL LETTER N WITH TILDE 147 "\x{00F2}" => "\x08", # LATIN SMALL LETTER O WITH GRAVE 148 "\x{00F6}" => "\x7C", # LATIN SMALL LETTER O WITH DIAERESIS 149 "\x{00F8}" => "\x0C", # LATIN SMALL LETTER O WITH STROKE 150 "\x{00F9}" => "\x06", # LATIN SMALL LETTER U WITH GRAVE 151 "\x{00FC}" => "\x7E", # LATIN SMALL LETTER U WITH DIAERESIS 152 "\x{0393}" => "\x13", # GREEK CAPITAL LETTER GAMMA 153 "\x{0394}" => "\x10", # GREEK CAPITAL LETTER DELTA 154 "\x{0398}" => "\x19", # GREEK CAPITAL LETTER THETA 155 "\x{039B}" => "\x14", # GREEK CAPITAL LETTER LAMDA 156 "\x{039E}" => "\x1A", # GREEK CAPITAL LETTER XI 157 "\x{03A0}" => "\x16", # GREEK CAPITAL LETTER PI 158 "\x{03A3}" => "\x18", # GREEK CAPITAL LETTER SIGMA 159 "\x{03A6}" => "\x12", # GREEK CAPITAL LETTER PHI 160 "\x{03A8}" => "\x17", # GREEK CAPITAL LETTER PSI 161 "\x{03A9}" => "\x15", # GREEK CAPITAL LETTER OMEGA 162 "\x{20AC}" => "\x1B\x65", # EURO SIGN 163); 164our %GSM2UNI = reverse %UNI2GSM; 165our $ESC = "\x1b"; 166our $ATMARK = "\x40"; 167our $FBCHAR = "\x3F"; 168our $NBSP = "\x{00A0}"; 169 170#define ERR_DECODE_NOMAP "%s \"\\x%02" UVXf "\" does not map to Unicode" 171 172sub decode ($$;$) { 173 my ( $obj, $bytes, $chk ) = @_; 174 my $str = substr($bytes, 0, 0); # to propagate taintedness; 175 while ( length $bytes ) { 176 my $c = substr( $bytes, 0, 1, '' ); 177 my $u; 178 if ( $c eq "\x00" ) { 179 my $c2 = substr( $bytes, 0, 1, '' ); 180 $u = 181 !length $c2 ? $ATMARK 182 : $c2 eq "\x00" ? "\x{0000}" 183 : exists $GSM2UNI{$c2} ? $ATMARK . $GSM2UNI{$c2} 184 : $chk 185 ? croak sprintf( "\\x%02X\\x%02X does not map to Unicode", 186 ord($c), ord($c2) ) 187 : $ATMARK . $FBCHAR; 188 189 } 190 elsif ( $c eq $ESC ) { 191 my $c2 = substr( $bytes, 0, 1, '' ); 192 $u = 193 exists $GSM2UNI{ $c . $c2 } ? $GSM2UNI{ $c . $c2 } 194 : exists $GSM2UNI{$c2} ? $NBSP . $GSM2UNI{$c2} 195 : $chk 196 ? croak sprintf( "\\x%02X\\x%02X does not map to Unicode", 197 ord($c), ord($c2) ) 198 : $NBSP . $FBCHAR; 199 } 200 else { 201 $u = 202 exists $GSM2UNI{$c} 203 ? $GSM2UNI{$c} 204 : $chk ? ref $chk eq 'CODE' 205 ? $chk->( ord $c ) 206 : croak sprintf( "\\x%02X does not map to Unicode", ord($c) ) 207 : $FBCHAR; 208 } 209 $str .= $u; 210 } 211 $_[1] = $bytes if $chk; 212 return $str; 213} 214 215#define ERR_ENCODE_NOMAP "\"\\x{%04" UVxf "}\" does not map to %s" 216 217sub encode($$;$) { 218 my ( $obj, $str, $chk ) = @_; 219 my $bytes = substr($str, 0, 0); # to propagate taintedness 220 while ( length $str ) { 221 my $u = substr( $str, 0, 1, '' ); 222 my $c; 223 $bytes .= 224 exists $UNI2GSM{$u} 225 ? $UNI2GSM{$u} 226 : $chk ? ref $chk eq 'CODE' 227 ? $chk->( ord($u) ) 228 : croak sprintf( "\\x{%04x} does not map to %s", 229 ord($u), $obj->name ) 230 : $FBCHAR; 231 } 232 $_[1] = $str if $chk; 233 return $bytes; 234} 235 2361; 237__END__ 238 239=head1 NAME 240 241Encode::GSM0338 -- ESTI GSM 03.38 Encoding 242 243=head1 SYNOPSIS 244 245 use Encode qw/encode decode/; 246 $gsm0338 = encode("gsm0338", $utf8); # loads Encode::GSM0338 implicitly 247 $utf8 = decode("gsm0338", $gsm0338); # ditto 248 249=head1 DESCRIPTION 250 251GSM0338 is for GSM handsets. Though it shares alphanumerals with ASCII, 252control character ranges and other parts are mapped very differently, 253mainly to store Greek characters. There are also escape sequences 254(starting with 0x1B) to cover e.g. the Euro sign. 255 256This was once handled by L<Encode::Bytes> but because of all those 257unusual specifications, Encode 2.20 has relocated the support to 258this module. 259 260=head1 NOTES 261 262Unlike most other encodings, the following always croaks on error 263for any $chk that evaluates to true. 264 265 $gsm0338 = encode("gsm0338", $utf8 $chk); 266 $utf8 = decode("gsm0338", $gsm0338, $chk); 267 268So if you want to check the validity of the encoding, surround the 269expression with C<eval {}> block as follows; 270 271 eval { 272 $utf8 = decode("gsm0338", $gsm0338, $chk); 273 }; 274 if ($@){ 275 # handle exception here 276 } 277 278=head1 BUGS 279 280ESTI GSM 03.38 Encoding itself. 281 282Mapping \x00 to '@' causes too much pain everywhere. 283 284Its use of \x1b (escape) is also very questionable. 285 286Because of those two, the code paging approach used use in ucm-based 287Encoding SOMETIMES fails so this module was written. 288 289=head1 SEE ALSO 290 291L<Encode> 292 293=cut 294