1#!/usr/bin/env perl
2
3use warnings;
4use strict;
5use Encode;
6use Switch;
7use Unicode::Normalize;
8use utf8;  # This source file MUST be stored UTF-8 encoded
9
10###############################################################################
11# code page data builder, by magnum / JimF.   v1.2
12# August 8, added parsing of ./UnicodeData.txt for building more macros
13# Coded July-Aug 2011, as a tool to build codepage encoding data needed
14# for John the Ripper code page conversions.  The data output from this file
15# is made to be directly placed into the ./src/encoding_data.h file in john's
16# source tree.
17# UnicodeData.txt is an official Unicode definition file and can be found at
18# ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
19# USAGE:  cmpt_cp.pl [-v] CODEPAGE
20# cmpt_cp.pl run without any arguments will show a list of possible code pages.
21###############################################################################
22
23# This should set our output to your terminal settings
24use open ':locale';
25
26# Set to 1 to permanently enable Unicode comments
27my $verbose = 1;
28if ($ARGV[0] eq "-v") {
29	$verbose++;
30	shift;
31}
32
33my $enc;
34if (@ARGV==1) {$enc=$ARGV[0];}
35else {
36	print "Supported encodings:\n", join(", ", Encode->encodings(":all")), "\n\n";
37	exit(0);
38}
39
40my %cat;
41my $filename = "UnicodeData.txt";
42my @subdirs = qw(unused Unicode);
43my $subdir = ".";
44foreach my $sd (@subdirs) {
45    if (-f "${sd}/${filename}" ) {
46	$subdir = $sd;
47    }
48}
49open FILE, "$subdir/$filename" or die $!;
50while (my $line = <FILE>) {
51	next if substr($line,0,1) eq "#";
52	my @line = split(';', $line);
53	$cat{hex($line[0])} = $line[2];
54}
55
56sub lookupCategory {
57	my $c = shift;
58	return $cat{$c};
59}
60
61sub printdef {
62	my $param = shift;
63	if (length($param)>80) {print" \\\n\t";}
64	elsif (length($param)>0) {print" ";}
65	if (length($param)>0)  {print "\"".$param."\"";}
66}
67
68sub printdef_null {
69	my $param = shift;
70	if (length($param)>80) {print" \\\n\t";}
71	else {print" ";}
72	print "\"".$param."\"";
73}
74
75my $to_unicode_high128="";
76my $lower="";  my $upper="";  my $lowonly="";  my $uponly="";  my $specials = "";  my $punctuation = "";  my $alpha = "";  my $digits = "";  my $control = "";  my $invalid = ""; my $whitespace = ""; my $vowels = "\\x59\\x79"; my $consonants = ""; my $nocase = "";
77my $clower=""; my $cupper=""; my $clowonly=""; my $cuponly=""; my $cspecials = ""; my $cpunctuation = ""; my $calpha = ""; my $cdigits = ""; my $cvowels = "Yy"; my $cconsonants = ""; my $cnocase = "";
78my $encu = uc($enc);my $hs = "";
79$encu =~ s/-/_/g;
80#######################################
81# first step, compute the unicode array
82#######################################
83foreach my $i (0x80..0xFF) {
84	my $u = chr($i);
85	$u = Encode::decode($enc, $u);
86	$hs .= $u;
87	if (ord($u) == 0xfffd) {
88		$u = chr($i);
89	}
90	$to_unicode_high128 .= "0x" . sprintf "%04X", ord($u);
91	if ($i % 16 == 15 && $i != 255) { $to_unicode_high128 .= ",\n"; }
92	elsif ($i != 255) { $to_unicode_high128 .= ","; }
93}
94if ($verbose) {
95	print "\n// "; foreach (8..9, 'A'..'F') { print $_, " "x15 };
96	print "\n// "; foreach (8..9, 'A'..'F') { print '0'..'9','A'..'F' };
97	print "\n// ", $hs, "\n";
98}
99print "\n// here is the $encu to Unicode conversion for $encu characters from 0x80 to 0xFF\n";
100print "static const UTF16 ".$encu."_to_unicode_high128[] = {\n";
101print $to_unicode_high128 . " };\n";
102
103#################################
104# Now build upcase/downcase data.
105#################################
106foreach my $i (0x80..0xFF) {
107	my $c = chr($i);
108	# converts $c into utf8, from $enc code page, and 'sets' the 'flag' in perl that $c IS a utf8 char.
109	$c = Encode::decode($enc, $c);
110
111	# upcase and low case the utf8 chars
112	my $ulc = lc $c; my $uuc = uc $c;
113	# reconvert the utf8 char's back into $enc code page.
114	my $elc = Encode::encode($enc, $ulc); my $euc = Encode::encode($enc, $uuc);
115	if ( (chr($i) eq $elc || chr($i) eq $euc) && $elc ne $euc) {
116	    if (chr($i) ne $euc) {
117			if (chr($i) ne $elc && chr($i) ne $euc) {
118				no warnings;
119				printf("// *** WARNING, char at 0x%X U+%04X (%s) needs to be looked into. Neither conversion gets back to original value!\n",$i,ord($c), $c);
120			} elsif ( length($euc) > 1) {
121				$lowonly .= sprintf("\\x%02X", ord($elc));
122				$clowonly .= $c;
123				printf("// *** WARNING, char at 0x%X U+%04X (%s -> %s) needs to be looked into.  Single to multi-byte conversion\n",$i,ord($c), $ulc, $uuc);
124			} elsif ( length($elc) > 1) {
125				$uponly .= sprintf("\\x%02X", ord($euc));
126				$cuponly .= $c;
127				printf("// *** WARNING, char at 0x%X U+%04X (%s -> %s) needs to be looked into.  Single to multi-byte conversion\n",$i,ord($c), $ulc, $uuc);
128			} elsif ( ord($euc) < 0x80) {
129				$lowonly .= sprintf("\\x%02X", ord($elc));
130				$clowonly .= $c;
131				if (ord($euc) != 0x3f) {
132					printf("// *** WARNING, char at 0x%X -> U+%04X -> U+%04X -> 0x%X (%s -> %s) needs to be looked into.  Likely one way casing conversion\n",$i,ord($ulc),ord($uuc),ord($euc), $ulc, $uuc);
133				}
134			} elsif ( ord($elc) < 0x80) {
135				$uponly .= sprintf("\\x%02X", ord($euc));
136				$cuponly .= $c;
137				if (ord($elc) != 0x3f) {
138					printf("// *** WARNING, char at 0x%X -> U+%04X -> U+%04X -> 0x%X (%s -> %s) needs to be looked into.  Likely one way casing conversion\n",$i,ord($ulc),ord($uuc),ord($euc), $ulc, $uuc);
139				}
140			} else {
141				$lower .= sprintf("\\x%02X", ord($elc));
142				$clower .= lc($c);
143				$upper .= sprintf("\\x%02X", ord($euc));
144				$cupper .= uc($c);
145			}
146		}
147	} else {
148		# NOTE, we can have letters which fail above.  Examples are U+00AA, U+00BA.  These are letters, lower case only, and there IS no upper case.
149		# this causes the original if to not find them. Thus, we 'look them up' here.
150		my $cat = lookupCategory(ord($c));
151		#printf STDERR "Category: $cat\n";
152		switch ($cat) {
153			case /^Ll/ { $lowonly .= sprintf("\\x%02X", ord($elc)); $clowonly .= $c; }
154			case /^Lu/ { $uponly  .= sprintf("\\x%02X", ord($euc)); $cuponly  .= $c; }
155			else {}
156		}
157	}
158
159	if (ord($c) == 0xfffd) {
160		$invalid .= sprintf("\\x%02X", $i);
161	} else {
162		my $cat = lookupCategory(ord($c));
163		switch ($cat) {
164			case /^Cf/ { $specials .= sprintf("\\x%02X", $i); $cspecials .= $c }
165			case /^L[lotu]/ {
166				$alpha .= sprintf("\\x%02X", $i);
167				$calpha .= $c;
168				if ($cat =~ /^Lo/) {
169					$nocase .= sprintf("\\x%02X", $i); $cnocase .= $c
170				}
171				# best-effort vowel/consonant matching
172				# We normalize to decomposed and match known vowels in lc
173				my $nfd = substr(NFD($c), 0, 1);
174				# Done: Latin, Nordic, Greek, Russian, Ukrainian, Turkish
175				if ($nfd =~ m/[aoueiyœæøɪʏɛɔαεηιοωυаэыуояеюиєіı]/i) {
176					$vowels .= sprintf("\\x%02X", $i);
177					$cvowels .= $c;
178					# Note, e.g., in English, y depends on situation
179					# (yellow, happy). We set latin yY variants as both!
180					if ($nfd =~ m/y/i) {
181						$consonants .= sprintf("\\x%02X", $i);
182						$cconsonants .= $c;
183					}
184				} else {
185					$consonants .= sprintf("\\x%02X", $i);
186					$cconsonants .= $c;
187				}
188			}
189			case /^Lm/ { $specials .= sprintf("\\x%02X", $i); $cspecials .= $c }
190			#case /^Ll/ { $lower .= sprintf("\\x%02X", $i); }
191			#case /^L[tu]/ { $upper .= sprintf("\\x%02X", $i); }
192			case /^M[cen]/ { $specials .= sprintf("\\x%02X", $i); $cspecials .= $c }
193			case /^S[ckmo]/ { $specials .= sprintf("\\x%02X", $i); $cspecials .= $c }
194			case /^N[dlo]/ { $digits .= sprintf("\\x%02X", $i); $cdigits .= $c }
195			case /^P[cdefios]/ { $punctuation .= sprintf("\\x%02X", $i); $cpunctuation .= $c }
196			case /^Z[lps]/ { $whitespace .= sprintf("\\x%02X", $i); }
197			case /^C/ { $control .= sprintf("\\x%02X", $i); }
198			else { print STDERR "*** Warning, $cat not handled\n"; }
199		}
200	}
201}
202print "\n// $clower\n" if $verbose;
203print "#define CHARS_LOWER_".$encu;
204printdef_null($lower);
205print "\n";
206
207print "\n// $clowonly\n" if $verbose;
208print "#define CHARS_LOW_ONLY_".$encu;
209printdef($lowonly);
210print "\n";
211
212print "\n// $cupper\n" if $verbose;
213print "#define CHARS_UPPER_".$encu;
214printdef_null($upper);
215print "\n";
216
217print "\n// $cuponly\n" if $verbose;
218print "#define CHARS_UP_ONLY_".$encu;
219printdef($uponly);
220print "\n";
221
222print "\n// $cnocase\n" if $verbose;
223print "#define CHARS_NOCASE_".$encu;
224printdef($nocase);
225print "\n";
226
227print "\n// $cdigits\n" if $verbose;
228print "#define CHARS_DIGITS_".$encu;
229printdef_null($digits);
230print "\n";
231
232print "\n// $cpunctuation\n" if $verbose;
233print "#define CHARS_PUNCTUATION_".$encu;
234printdef($punctuation);
235print "\n";
236
237print "\n// $cspecials\n" if $verbose;
238print "#define CHARS_SPECIALS_".$encu;
239printdef($specials);
240print "\n";
241
242print "\n// $calpha\n" if $verbose;
243print "#define CHARS_ALPHA_".$encu;
244printdef($alpha);
245print "\n";
246
247print "\n" if $verbose;
248print "#define CHARS_WHITESPACE_".$encu;
249printdef($whitespace);
250print "\n";
251
252print "\n" if $verbose;
253print "#define CHARS_CONTROL_".$encu;
254printdef($control);
255print "\n";
256
257print "\n" if $verbose;
258print "#define CHARS_INVALID_".$encu;
259printdef_null($invalid);
260print "\n";
261
262print "\n// $cvowels\n" if $verbose;
263print "#define CHARS_VOWELS_".$encu;
264printdef($vowels);
265print "\n";
266
267print "\n// $cconsonants\n" if $verbose;
268print "#define CHARS_CONSONANTS_".$encu;
269printdef($consonants);
270print "\n";
271
272####################################################################
273# Ok, provide a check to see if any of the characters UNDER 0x80
274# are non-standard.  At this time, there is no plan on HOW to handle
275# this within john.  The information is simply listed at this time.
276####################################################################
277foreach my $i (0x20..0x7E) {
278	my $u = chr($i);
279	Encode::from_to($u, $enc, "utf8");
280	my $str = sprintf "%04X", ord Encode::decode("UTF-8", $u);
281	if ( hex($str) != $i) { printf("WARNING, low character %X maps into Unicode 0x%s\n", $i, $str);}
282}
283