1#!/usr/bin/perl -w
2
3# Copyright (C) 2003, 2004, 2005, 2006 Apple Computer, Inc. All rights reserved.
4#
5# Redistribution and use in source and binary forms, with or without
6# modification, are permitted provided that the following conditions
7# are met:
8#
9# 1.  Redistributions of source code must retain the above copyright
10#     notice, this list of conditions and the following disclaimer.
11# 2.  Redistributions in binary form must reproduce the above copyright
12#     notice, this list of conditions and the following disclaimer in the
13#     documentation and/or other materials provided with the distribution.
14# 3.  Neither the name of Apple Computer, Inc. ("Apple") nor the names of
15#     its contributors may be used to endorse or promote products derived
16#     from this software without specific prior written permission.
17#
18# THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
19# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21# DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
22# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
23# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
24# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
25# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
27# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28
29use strict;
30
31my %aliasesFromCharsetsFile;
32my %namesWritten;
33
34my $output = "";
35
36my $error = 0;
37
38sub error ($)
39{
40    print STDERR @_, "\n";
41    $error = 1;
42}
43
44sub emit_line
45{
46    my ($name, $prefix, $encoding, $flags) = @_;
47
48    error "$name shows up twice in output" if $namesWritten{$name};
49    $namesWritten{$name} = 1;
50
51    $output .= "        { \"$name\", $prefix$encoding },\n";
52}
53
54sub process_platform_encodings
55{
56    my ($filename, $PlatformPrefix) = @_;
57    my $baseFilename = $filename;
58    $baseFilename =~ s|.*/||;
59
60    my %seenPlatformNames;
61    my %seenIANANames;
62
63    open PLATFORM_ENCODINGS, $filename or die;
64
65    while (<PLATFORM_ENCODINGS>) {
66        chomp;
67        s/\#.*$//;
68        s/\s+$//;
69        if (my ($PlatformName, undef, $flags, $IANANames) = /^(.+?)(, (.+))?: (.+)$/) {
70            my %aliases;
71
72            my $PlatformNameWithFlags = $PlatformName;
73            if ($flags) {
74                $PlatformNameWithFlags .= ", " . $flags;
75            } else {
76                $flags = "NoEncodingFlags";
77            }
78            error "Platform encoding name $PlatformName is mentioned twice in $baseFilename" if $seenPlatformNames{$PlatformNameWithFlags};
79            $seenPlatformNames{$PlatformNameWithFlags} = 1;
80
81            # Build the aliases list.
82            # Also check that no two names are part of the same entry in the charsets file.
83            my @IANANames = split ", ", $IANANames;
84            my $firstName = "";
85            my $canonicalFirstName = "";
86            my $prevName = "";
87            for my $name (@IANANames) {
88                if ($firstName eq "") {
89                    if ($name !~ /^[-A-Za-z0-9_]+$/) {
90                        error "$name, in $baseFilename, has illegal characters in it";
91                        next;
92                    }
93                    $firstName = $name;
94                } else {
95                    if ($name !~ /^[a-z0-9]+$/) {
96                        error "$name, in $baseFilename, has illegal characters in it (must be all lowercase alphanumeric)";
97                        next;
98                    }
99                    if ($name le $prevName) {
100                        error "$name comes after $prevName in $baseFilename, but everything must be in alphabetical order";
101                    }
102                    $prevName = $name;
103                }
104
105                my $canonicalName = lc $name;
106                $canonicalName =~ tr/-_//d;
107
108                $canonicalFirstName = $canonicalName if $canonicalFirstName eq "";
109
110                error "$name is mentioned twice in $baseFilename" if $seenIANANames{$canonicalName};
111                $seenIANANames{$canonicalName} = 1;
112
113                $aliases{$canonicalName} = 1;
114                next if !$aliasesFromCharsetsFile{$canonicalName};
115                for my $alias (@{$aliasesFromCharsetsFile{$canonicalName}}) {
116                    $aliases{$alias} = 1;
117                }
118                for my $otherName (@IANANames) {
119                    next if $canonicalName eq $otherName;
120                    if ($aliasesFromCharsetsFile{$otherName}
121                        && $aliasesFromCharsetsFile{$canonicalName} eq $aliasesFromCharsetsFile{$otherName}
122                        && $canonicalName le $otherName) {
123                        error "$baseFilename lists both $name and $otherName under $PlatformName, but that aliasing is already specified in character-sets.txt";
124                    }
125                }
126            }
127
128            # write out
129            emit_line($firstName, $PlatformPrefix, $PlatformName, $flags);
130            for my $alias (sort keys %aliases) {
131                emit_line($alias, $PlatformPrefix, $PlatformName, $flags) if $alias ne $canonicalFirstName;
132            }
133        } elsif (/^([a-zA-Z0-9_]+)(, (.+))?$/) {
134            my $PlatformName = $1;
135
136            error "Platform encoding name $PlatformName is mentioned twice in $baseFilename" if $seenPlatformNames{$PlatformName};
137            $seenPlatformNames{$PlatformName} = 1;
138        } elsif (/./) {
139            error "syntax error in $baseFilename, line $.";
140        }
141    }
142
143    close PLATFORM_ENCODINGS;
144}
145
146sub process_iana_charset
147{
148    my ($canonical_name, @aliases) = @_;
149
150    return if !$canonical_name;
151
152    my @names = sort $canonical_name, @aliases;
153
154    for my $name (@names) {
155        $aliasesFromCharsetsFile{$name} = \@names;
156    }
157}
158
159sub process_iana_charsets
160{
161    my ($filename) = @_;
162
163    open CHARSETS, $filename or die;
164
165    my %seen;
166
167    my $canonical_name;
168    my @aliases;
169
170    my %exceptions = ( isoir91 => 1, isoir92 => 1 );
171
172    while (<CHARSETS>) {
173        chomp;
174        if ((my $new_canonical_name) = /Name: ([^ \t]*).*/) {
175            $new_canonical_name = lc $new_canonical_name;
176            $new_canonical_name =~ tr/a-z0-9//cd;
177
178            error "saw $new_canonical_name twice in character-sets.txt", if $seen{$new_canonical_name};
179            $seen{$new_canonical_name} = $new_canonical_name;
180
181            process_iana_charset $canonical_name, @aliases;
182
183            $canonical_name = $new_canonical_name;
184            @aliases = ();
185        } elsif ((my $new_alias) = /Alias: ([^ \t]*).*/) {
186            $new_alias = lc $new_alias;
187            $new_alias =~ tr/a-z0-9//cd;
188
189            # do this after normalizing the alias, sometimes character-sets.txt
190            # has weird escape characters, e.g. \b after None
191            next if $new_alias eq "none";
192
193            error "saw $new_alias twice in character-sets.txt $seen{$new_alias}, $canonical_name", if $seen{$new_alias} && $seen{$new_alias} ne $canonical_name && !$exceptions{$new_alias};
194            push @aliases, $new_alias if !$seen{$new_alias};
195            $seen{$new_alias} = $canonical_name;
196        }
197    }
198
199    process_iana_charset $canonical_name, @aliases;
200
201    close CHARSETS;
202}
203
204# Program body
205
206process_iana_charsets($ARGV[0]);
207process_platform_encodings($ARGV[1], $ARGV[2]);
208
209exit 1 if $error;
210
211print <<EOF
212// File generated by make-charset-table.pl. Do not edit!
213
214#include "config.h"
215#include "CharsetData.h"
216
217namespace WebCore {
218
219    const CharsetEntry CharsetTable[] = {
220$output
221        { 0, 0 }
222    };
223
224}
225EOF
226