1#!/usr/bin/env perl 2 3use warnings; 4use strict; 5use Encode; 6use Switch; 7use Unicode::Normalize; 8use utf8; # This source file MUST be stored UTF-8 encoded 9 10############################################################################### 11# code page data builder, by magnum / JimF. v1.2 12# August 8, added parsing of ./UnicodeData.txt for building more macros 13# Coded July-Aug 2011, as a tool to build codepage encoding data needed 14# for John the Ripper code page conversions. The data output from this file 15# is made to be directly placed into the ./src/encoding_data.h file in john's 16# source tree. 17# UnicodeData.txt is an official Unicode definition file and can be found at 18# ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt 19# USAGE: cmpt_cp.pl [-v] CODEPAGE 20# cmpt_cp.pl run without any arguments will show a list of possible code pages. 21############################################################################### 22 23# This should set our output to your terminal settings 24use open ':locale'; 25 26# Set to 1 to permanently enable Unicode comments 27my $verbose = 1; 28if ($ARGV[0] eq "-v") { 29 $verbose++; 30 shift; 31} 32 33my $enc; 34if (@ARGV==1) {$enc=$ARGV[0];} 35else { 36 print "Supported encodings:\n", join(", ", Encode->encodings(":all")), "\n\n"; 37 exit(0); 38} 39 40my %cat; 41my $filename = "UnicodeData.txt"; 42my @subdirs = qw(unused Unicode); 43my $subdir = "."; 44foreach my $sd (@subdirs) { 45 if (-f "${sd}/${filename}" ) { 46 $subdir = $sd; 47 } 48} 49open FILE, "$subdir/$filename" or die $!; 50while (my $line = <FILE>) { 51 next if substr($line,0,1) eq "#"; 52 my @line = split(';', $line); 53 $cat{hex($line[0])} = $line[2]; 54} 55 56sub lookupCategory { 57 my $c = shift; 58 return $cat{$c}; 59} 60 61sub printdef { 62 my $param = shift; 63 if (length($param)>80) {print" \\\n\t";} 64 elsif (length($param)>0) {print" ";} 65 if (length($param)>0) {print "\"".$param."\"";} 66} 67 68sub printdef_null { 69 my $param = shift; 70 if (length($param)>80) {print" \\\n\t";} 71 else {print" ";} 72 print "\"".$param."\""; 73} 74 75my $to_unicode_high128=""; 76my $lower=""; my $upper=""; my $lowonly=""; my $uponly=""; my $specials = ""; my $punctuation = ""; my $alpha = ""; my $digits = ""; my $control = ""; my $invalid = ""; my $whitespace = ""; my $vowels = "\\x59\\x79"; my $consonants = ""; my $nocase = ""; 77my $clower=""; my $cupper=""; my $clowonly=""; my $cuponly=""; my $cspecials = ""; my $cpunctuation = ""; my $calpha = ""; my $cdigits = ""; my $cvowels = "Yy"; my $cconsonants = ""; my $cnocase = ""; 78my $encu = uc($enc);my $hs = ""; 79$encu =~ s/-/_/g; 80####################################### 81# first step, compute the unicode array 82####################################### 83foreach my $i (0x80..0xFF) { 84 my $u = chr($i); 85 $u = Encode::decode($enc, $u); 86 $hs .= $u; 87 if (ord($u) == 0xfffd) { 88 $u = chr($i); 89 } 90 $to_unicode_high128 .= "0x" . sprintf "%04X", ord($u); 91 if ($i % 16 == 15 && $i != 255) { $to_unicode_high128 .= ",\n"; } 92 elsif ($i != 255) { $to_unicode_high128 .= ","; } 93} 94if ($verbose) { 95 print "\n// "; foreach (8..9, 'A'..'F') { print $_, " "x15 }; 96 print "\n// "; foreach (8..9, 'A'..'F') { print '0'..'9','A'..'F' }; 97 print "\n// ", $hs, "\n"; 98} 99print "\n// here is the $encu to Unicode conversion for $encu characters from 0x80 to 0xFF\n"; 100print "static const UTF16 ".$encu."_to_unicode_high128[] = {\n"; 101print $to_unicode_high128 . " };\n"; 102 103################################# 104# Now build upcase/downcase data. 105################################# 106foreach my $i (0x80..0xFF) { 107 my $c = chr($i); 108 # converts $c into utf8, from $enc code page, and 'sets' the 'flag' in perl that $c IS a utf8 char. 109 $c = Encode::decode($enc, $c); 110 111 # upcase and low case the utf8 chars 112 my $ulc = lc $c; my $uuc = uc $c; 113 # reconvert the utf8 char's back into $enc code page. 114 my $elc = Encode::encode($enc, $ulc); my $euc = Encode::encode($enc, $uuc); 115 if ( (chr($i) eq $elc || chr($i) eq $euc) && $elc ne $euc) { 116 if (chr($i) ne $euc) { 117 if (chr($i) ne $elc && chr($i) ne $euc) { 118 no warnings; 119 printf("// *** WARNING, char at 0x%X U+%04X (%s) needs to be looked into. Neither conversion gets back to original value!\n",$i,ord($c), $c); 120 } elsif ( length($euc) > 1) { 121 $lowonly .= sprintf("\\x%02X", ord($elc)); 122 $clowonly .= $c; 123 printf("// *** WARNING, char at 0x%X U+%04X (%s -> %s) needs to be looked into. Single to multi-byte conversion\n",$i,ord($c), $ulc, $uuc); 124 } elsif ( length($elc) > 1) { 125 $uponly .= sprintf("\\x%02X", ord($euc)); 126 $cuponly .= $c; 127 printf("// *** WARNING, char at 0x%X U+%04X (%s -> %s) needs to be looked into. Single to multi-byte conversion\n",$i,ord($c), $ulc, $uuc); 128 } elsif ( ord($euc) < 0x80) { 129 $lowonly .= sprintf("\\x%02X", ord($elc)); 130 $clowonly .= $c; 131 if (ord($euc) != 0x3f) { 132 printf("// *** WARNING, char at 0x%X -> U+%04X -> U+%04X -> 0x%X (%s -> %s) needs to be looked into. Likely one way casing conversion\n",$i,ord($ulc),ord($uuc),ord($euc), $ulc, $uuc); 133 } 134 } elsif ( ord($elc) < 0x80) { 135 $uponly .= sprintf("\\x%02X", ord($euc)); 136 $cuponly .= $c; 137 if (ord($elc) != 0x3f) { 138 printf("// *** WARNING, char at 0x%X -> U+%04X -> U+%04X -> 0x%X (%s -> %s) needs to be looked into. Likely one way casing conversion\n",$i,ord($ulc),ord($uuc),ord($euc), $ulc, $uuc); 139 } 140 } else { 141 $lower .= sprintf("\\x%02X", ord($elc)); 142 $clower .= lc($c); 143 $upper .= sprintf("\\x%02X", ord($euc)); 144 $cupper .= uc($c); 145 } 146 } 147 } else { 148 # NOTE, we can have letters which fail above. Examples are U+00AA, U+00BA. These are letters, lower case only, and there IS no upper case. 149 # this causes the original if to not find them. Thus, we 'look them up' here. 150 my $cat = lookupCategory(ord($c)); 151 #printf STDERR "Category: $cat\n"; 152 switch ($cat) { 153 case /^Ll/ { $lowonly .= sprintf("\\x%02X", ord($elc)); $clowonly .= $c; } 154 case /^Lu/ { $uponly .= sprintf("\\x%02X", ord($euc)); $cuponly .= $c; } 155 else {} 156 } 157 } 158 159 if (ord($c) == 0xfffd) { 160 $invalid .= sprintf("\\x%02X", $i); 161 } else { 162 my $cat = lookupCategory(ord($c)); 163 switch ($cat) { 164 case /^Cf/ { $specials .= sprintf("\\x%02X", $i); $cspecials .= $c } 165 case /^L[lotu]/ { 166 $alpha .= sprintf("\\x%02X", $i); 167 $calpha .= $c; 168 if ($cat =~ /^Lo/) { 169 $nocase .= sprintf("\\x%02X", $i); $cnocase .= $c 170 } 171 # best-effort vowel/consonant matching 172 # We normalize to decomposed and match known vowels in lc 173 my $nfd = substr(NFD($c), 0, 1); 174 # Done: Latin, Nordic, Greek, Russian, Ukrainian, Turkish 175 if ($nfd =~ m/[aoueiyœæøɪʏɛɔαεηιοωυаэыуояеюиєіı]/i) { 176 $vowels .= sprintf("\\x%02X", $i); 177 $cvowels .= $c; 178 # Note, e.g., in English, y depends on situation 179 # (yellow, happy). We set latin yY variants as both! 180 if ($nfd =~ m/y/i) { 181 $consonants .= sprintf("\\x%02X", $i); 182 $cconsonants .= $c; 183 } 184 } else { 185 $consonants .= sprintf("\\x%02X", $i); 186 $cconsonants .= $c; 187 } 188 } 189 case /^Lm/ { $specials .= sprintf("\\x%02X", $i); $cspecials .= $c } 190 #case /^Ll/ { $lower .= sprintf("\\x%02X", $i); } 191 #case /^L[tu]/ { $upper .= sprintf("\\x%02X", $i); } 192 case /^M[cen]/ { $specials .= sprintf("\\x%02X", $i); $cspecials .= $c } 193 case /^S[ckmo]/ { $specials .= sprintf("\\x%02X", $i); $cspecials .= $c } 194 case /^N[dlo]/ { $digits .= sprintf("\\x%02X", $i); $cdigits .= $c } 195 case /^P[cdefios]/ { $punctuation .= sprintf("\\x%02X", $i); $cpunctuation .= $c } 196 case /^Z[lps]/ { $whitespace .= sprintf("\\x%02X", $i); } 197 case /^C/ { $control .= sprintf("\\x%02X", $i); } 198 else { print STDERR "*** Warning, $cat not handled\n"; } 199 } 200 } 201} 202print "\n// $clower\n" if $verbose; 203print "#define CHARS_LOWER_".$encu; 204printdef_null($lower); 205print "\n"; 206 207print "\n// $clowonly\n" if $verbose; 208print "#define CHARS_LOW_ONLY_".$encu; 209printdef($lowonly); 210print "\n"; 211 212print "\n// $cupper\n" if $verbose; 213print "#define CHARS_UPPER_".$encu; 214printdef_null($upper); 215print "\n"; 216 217print "\n// $cuponly\n" if $verbose; 218print "#define CHARS_UP_ONLY_".$encu; 219printdef($uponly); 220print "\n"; 221 222print "\n// $cnocase\n" if $verbose; 223print "#define CHARS_NOCASE_".$encu; 224printdef($nocase); 225print "\n"; 226 227print "\n// $cdigits\n" if $verbose; 228print "#define CHARS_DIGITS_".$encu; 229printdef_null($digits); 230print "\n"; 231 232print "\n// $cpunctuation\n" if $verbose; 233print "#define CHARS_PUNCTUATION_".$encu; 234printdef($punctuation); 235print "\n"; 236 237print "\n// $cspecials\n" if $verbose; 238print "#define CHARS_SPECIALS_".$encu; 239printdef($specials); 240print "\n"; 241 242print "\n// $calpha\n" if $verbose; 243print "#define CHARS_ALPHA_".$encu; 244printdef($alpha); 245print "\n"; 246 247print "\n" if $verbose; 248print "#define CHARS_WHITESPACE_".$encu; 249printdef($whitespace); 250print "\n"; 251 252print "\n" if $verbose; 253print "#define CHARS_CONTROL_".$encu; 254printdef($control); 255print "\n"; 256 257print "\n" if $verbose; 258print "#define CHARS_INVALID_".$encu; 259printdef_null($invalid); 260print "\n"; 261 262print "\n// $cvowels\n" if $verbose; 263print "#define CHARS_VOWELS_".$encu; 264printdef($vowels); 265print "\n"; 266 267print "\n// $cconsonants\n" if $verbose; 268print "#define CHARS_CONSONANTS_".$encu; 269printdef($consonants); 270print "\n"; 271 272#################################################################### 273# Ok, provide a check to see if any of the characters UNDER 0x80 274# are non-standard. At this time, there is no plan on HOW to handle 275# this within john. The information is simply listed at this time. 276#################################################################### 277foreach my $i (0x20..0x7E) { 278 my $u = chr($i); 279 Encode::from_to($u, $enc, "utf8"); 280 my $str = sprintf "%04X", ord Encode::decode("UTF-8", $u); 281 if ( hex($str) != $i) { printf("WARNING, low character %X maps into Unicode 0x%s\n", $i, $str);} 282} 283