1#!/usr/local/bin/perl
2# convert TeX (Patgen) hyphenation patterns to Libhnj format
3# (A utility for finding substring embeddings in patterns)
4# usage: substrings.pl inputfile outputfile [encoding]
5
6if (!defined $ARGV[1]) {
7    print "" .
8"substrings.pl - convert TeX (Patgen) hyphenation patterns to Libhnj format\n" .
9"(A utility for finding substring embeddings in patterns)\n" .
10"usage: substrings.pl infile outfile [encoding [lefthyphenmin [righthyphenmin]]]\n";
11    exit 1;
12}
13$fn = $ARGV[0];
14if (!-e $fn) { $fn = "hyphen.us"; }
15open HYPH, $fn;
16open OUT, ">$ARGV[1]";
17$encoding = $ARGV[2];
18$lhmin = $ARGV[3];
19$rhmin = $ARGV[4];
20if (defined $encoding) { print OUT "$encoding\n"; }
21if (defined $lhmin) { print OUT "LEFTHYPHENMIN $lhmin\n"; }
22if (defined $rhmin) { print OUT "RIGHTHYPHENMIN $rhmin\n"; }
23
24while (<HYPH>)
25{
26    $pat =~ s/%.*$//g;
27    if (/^\%/) {
28	#comment, ignore
29    } elsif (/^(.+)\/([^,]+),([0-9]+),([0-9]+)$/) {
30        $origpat = $1;
31	$pat = $1;
32        $repl = $2;
33        $beg = $3;
34        $len = $4;
35	$pat =~ s/\d//g;
36        if ($origpat eq $pat) {
37            print "error - missing hyphenation point: $_";
38            exit 1;
39        }
40	push @patlist, $pat;
41	$pattab{$pat} = $origpat;
42        $repltab{$pat} = $repl;
43        $replbeg{$pat} = $beg - 1;
44        $repllen{$pat} = $len;
45    } elsif (/^(.+)\/(.+)$/) {
46        $origpat = $1;
47	$pat = $1;
48        $repl = $2;
49	$pat =~ s/\d//g;
50        if ($origpat eq $pat) {
51            print "error - missing hyphenation point: $_";
52            exit 1;
53        }
54	push @patlist, $pat;
55	$pattab{$pat} = $origpat;
56        $repltab{$pat} = $repl;
57        $replbeg{$pat} = 0;
58        $repllen{$pat} = enclen($pat);
59    } elsif (/^(.+)$/) {
60	$origpat = $1;
61	$pat = $1;
62	$pat =~ s/\d//g;
63	push @patlist, $pat;
64	$pattab{$pat} = $origpat;
65    }
66}
67
68foreach $pat (@patlist) {
69    $patsize = length $pat;
70    for $i (0..$patsize - 1) {
71	for $j (1..$patsize - $i) {
72	    $subpat = substr ($pat, $i, $j);
73	    if (defined $pattab{$subpat}) {
74		print "$pattab{$subpat} is embedded in $pattab{$pat}\n";
75		$newpat = substr $pat, 0, $i + $j;
76		if (!defined $newpattab{$newpat}) {
77		    $newpattab{$newpat} =
78			substr ($pat, 0, $i).$pattab{$subpat};
79		    $ss = substr $pat, 0, $i;
80		    print "$ss+$pattab{$subpat}\n";
81		    push @newpatlist, $newpat;
82		    if (defined $repltab{$subpat}) {
83                        $begcorr = (($pat =~ /^[.]/) && !($subpat =~ /^[.]/)) ? 1 : 0;
84                        $newrepltab{$newpat} = $repltab{$subpat};
85                        $newreplbeg{$newpat} = $replbeg{$subpat} + enclen($ss) - $begcorr;
86                        $newrepllen{$newpat} = $repllen{$subpat};
87                    }
88		} else {
89		    $tmp =  $newpattab{$newpat};
90		    $newpattab{$newpat} =
91			combine ($newpattab{$newpat}, $pattab{$subpat});
92		    print "$tmp + $pattab{$subpat} -> $newpattab{$newpat}\n";
93		}
94	    }
95	}
96    }
97}
98
99foreach $pat (@newpatlist) {
100    if (defined $newrepltab{$pat}) {
101        print OUT $newpattab{$pat}."/".$newrepltab{$pat}.",".($newreplbeg{$pat}+1).",".$newrepllen{$pat}."\n";
102    } else {
103        print OUT $newpattab{$pat}."\n";
104    }
105}
106
107#convert 'n1im' to 0n1i0m0 expresed as a list
108sub expand {
109    my ($pat) = @_;
110    my $last = '.';
111    my @exp = ();
112
113    foreach $c (split (//, $pat)) {
114	if ($last =~ /[\D]/ && $c =~ /[\D]/) {
115	    push @exp, 0;
116	}
117	push @exp, $c;
118	$last = $c;
119    }
120    if ($last =~ /[\D]/) {
121	push @exp, 0;
122    }
123    return @exp;
124}
125
126# Combine two patterns, i.e. .ad4der + a2d becomes .a2d4der
127# The second pattern needs to be a substring of the first (modulo digits)
128sub combine {
129    my @exp = expand shift;
130    my @subexp = expand shift;
131    my $pat1, $pat2;
132    my $i;
133
134    $pat1 = join ('', map { $_ =~ /\d/ ? () : $_ } @exp);
135    $pat2 = join ('', map { $_ =~ /\d/ ? () : $_ } @subexp);
136
137    $begcorr = ($pat1 =~ /^[.]/) ? 1 : 0;
138
139    for $i (0..length ($pat1) - length ($pat2)) {
140	if (substr ($pat1, $i, length $pat2) eq $subpat) {
141	    for ($j = 0; $j < @subexp; $j += 2) {
142		if ($subexp[$j] > $exp[2 * $i + $j]) {
143		    $exp[2 * $i + $j] = $subexp[$j];
144                    if (defined $newrepltab{$pat2} && !defined $newrepltab{$pat1}) {
145                        $ss = substr ($pat1, 0, $i);
146                        $newrepltab{$pat1} = $newrepltab{$pat2};
147                        $newreplbeg{$pat1} = $newreplbeg{$pat2} + enclen($ss) - $begcorr;
148                        $newrepllen{$pat1} = $newrepllen{$pat2};
149                    }
150		}
151	    }
152	    print ("$pat1 includes $pat2 at pos $i\n");
153	}
154    }
155    return join ('', map { $_ eq '0' ? () : $_ } @exp);
156}
157
158# 8 bit or UTF-8 character length (calculating right start position for discretionary hyphenation)
159sub enclen {
160    my $nonchar = 0;
161    my $len = length($_[0]);
162    if ($encoding eq "UTF-8") {
163        # length of an UTF-8 string equals to the count of the characters not started with '10' bits
164        for ($i = 0; $i < $len; $i++) {
165            if ((ord(substr($_[0], $i, 1)) >> 6) == 2) { $nonchar++; }
166        }
167    }
168    return $len - $nonchar;
169}
170