1#!/usr/bin/env python3
2#
3# Script to generate tables for cpp_wcwidth, leveraging glibc's utf8_gen.py.
4#
5# This file is part of GCC.
6#
7# GCC is free software; you can redistribute it and/or modify it under
8# the terms of the GNU General Public License as published by the Free
9# Software Foundation; either version 3, or (at your option) any later
10# version.
11#
12# GCC is distributed in the hope that it will be useful, but WITHOUT ANY
13# WARRANTY; without even the implied warranty of MERCHANTABILITY or
14# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
15# for more details.
16#
17# You should have received a copy of the GNU General Public License
18# along with GCC; see the file COPYING3.  If not see
19# <http://www.gnu.org/licenses/>.  */
20
21import sys
22import os
23
# Exactly one command-line argument is required: the Unicode version.  It is
# passed on to glibc's utf8_gen.py and recorded in the generated header comment.
if len(sys.argv) != 2:
    # Fill in the program name so the usage line does not show a raw "%s".
    print("usage: %s <unicode version>" % sys.argv[0], file=sys.stderr)
    sys.exit(1)
unicode_version = sys.argv[1]
28
# Parse a codepoint in the format output by glibc tools, e.g. "<UA8FF>", and
# return it as an integer.  Raises ValueError when the "<U" prefix or the ">"
# suffix is missing, or when the text between them is not a base-16 number.
def parse_ucn(s):
    if not s.startswith("<U") or not s.endswith(">"):
        raise ValueError
    # Drop the two-character "<U" prefix and the closing ">".
    hex_digits = s[2:-1]
    return int(hex_digits, base=16)
34
# Process a line of width output from utf8_gen.py and update global array.
#
# widths[cp] is the width recorded for codepoint cp; every entry starts at 1
# and is overwritten by the lines passed to process_width.
#
# Raises IndexError if the line has fewer than two whitespace-separated
# fields, and ValueError if the width is not an integer, a codepoint is
# malformed, or the range is reversed, negative, or extends past the end of
# the table.
widths = [1] * (1 + 0x10FFFF)
def process_width(line):
    # Example lines (a codepoint or an inclusive range, then the width):
    # <UA8FF>	0
    # <UA926>...<UA92D>	0

    fields = line.split()
    width = int(fields[1])
    bounds = fields[0].split("...")
    if len(bounds) == 1:
        first = last = parse_ucn(bounds[0])
    elif len(bounds) == 2:
        first = parse_ucn(bounds[0])
        last = parse_ucn(bounds[1])
    else:
        raise ValueError("malformed codepoint range: %r" % fields[0])

    # Slice assignment never raises on bad bounds: a negative or too-large
    # index would insert elements and change the table's length, and a
    # reversed range would be silently dropped.  Reject those explicitly.
    if not 0 <= first <= last < len(widths):
        raise ValueError("codepoint range out of bounds: %r" % fields[0])
    widths[first:last + 1] = [width] * (last + 1 - first)
54
# To keep things simple, we use glibc utf8_gen.py as-is.  It only outputs to a
# file named UTF-8, which is not configurable.  Then we parse this into the form
# we want it.
status = os.system("from_glibc/utf8_gen.py --unicode_version %s" % unicode_version)
if status != 0:
    # Do not go on to parse a UTF-8 file that is missing or left over from an
    # earlier run.
    print("error: from_glibc/utf8_gen.py failed (status %d)" % status,
          file=sys.stderr)
    sys.exit(1)

# Only the lines between a "WIDTH" line and the following "END WIDTH" line are
# handed to process_width; everything else in the file is skipped.
processing = False
with open("UTF-8", "r") as charmap:
    for line in charmap:
        if processing:
            if line == "END WIDTH\n":
                processing = False
            else:
                try:
                    process_width(line)
                except (ValueError, IndexError) as e:
                    # line still carries its newline, hence end="".
                    print(e, "warning: ignored unexpected line: %s" % line,
                          file=sys.stderr, end="")
        elif line == "WIDTH\n":
            processing = True
72
# All bytes < 256 we treat as width 1.  The slice end is exclusive, so it must
# be 256 for codepoint 0xFF to be included.
widths[0:256] = [1] * 256
75
# Condense the list to contiguous ranges.
#
# Each entry of all_ranges is a two-element list [last codepoint, width]
# describing one run of consecutive codepoints sharing a width.  cur_range is
# the run currently being extended; it starts as an empty width-1 run ending
# at -1 so that leading width-1 codepoints simply extend it.
#
# The run still open when the loop finishes stays in cur_range and is not
# added to all_ranges.
# NOTE(review): presumably the consumer of the tables supplies a default for
# codepoints past the last listed range -- confirm.
all_ranges = []
cur_range = [-1, 1]
for codepoint, width in enumerate(widths):
    if width != cur_range[1]:
        # Width changed: close the current run and open a new one here.
        all_ranges.append(cur_range)
        cur_range = [codepoint, width]
        continue
    # Same width: the current run now ends at this codepoint.
    cur_range[0] = codepoint
85
# Output the arrays for generated_cpp_wcwidth.h

# Lay out already-formatted cells as the body of a C array initializer:
# per_row cells to a line, each line opened by a newline plus two spaces,
# cells on the same line separated by one space.
def _table_body(cells, per_row):
    lines = []
    for start in range(0, len(cells), per_row):
        lines.append("\n  " + " ".join(cells[start:start + per_row]))
    return "".join(lines)

# Header comment recording how the tables were produced.
print("/*  Generated by contrib/unicode/gen_wcwidth.py,",
      "with the help of glibc's")
print("    utf8_gen.py, using version %s" % unicode_version,
      "of the Unicode standard.  */")

# Last codepoint of each range, in hex, eight per line.
range_end_cells = ["0x%x," % (r[0]) for r in all_ranges]
print("\nstatic const cppchar_t wcwidth_range_ends[] = {",
      _table_body(range_end_cells, 8),
      "\n};\n", sep="")

# Width of each range, in the same order, twenty-four per line.
width_cells = ["%d," % r[1] for r in all_ranges]
print("static const unsigned char wcwidth_widths[] = {",
      _table_body(width_cells, 24),
      "\n};", sep="")
107