contrib/unicode/gen_wcwidth.py

#!/usr/bin/env python3
#
# Script to generate tables for cpp_wcwidth, leveraging glibc's utf8_gen.py.
#
# This file is part of GCC.
#
# GCC is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 3, or (at your option) any later
# version.
#
# GCC is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
# for more details.
#
# You should have received a copy of the GNU General Public License
# along with GCC; see the file COPYING3.  If not see
# <http://www.gnu.org/licenses/>.

import sys
import os

# Exactly one argument is required: the Unicode version, which is passed on to
# utf8_gen.py and recorded in the generated header comment.
if len(sys.argv) != 2:
    # Fall back to a fixed name in the unusual case of an empty argv.
    prog = sys.argv[0] if sys.argv else "gen_wcwidth.py"
    print("usage: %s <unicode version>" % prog, file=sys.stderr)
    sys.exit(1)
unicode_version = sys.argv[1]

# Parse a codepoint in the format output by glibc tools, e.g. "<UA8FF>" or
# "<U0001F600>", and return it as an integer.  Raises ValueError, with a
# message naming the offending token, if S does not have that form.
def parse_ucn(s):
    if not (s.startswith("<U") and s.endswith(">")):
        raise ValueError("expected <Uxxxx>, got %r" % s)
    digits = s[2:-1]
    # int(..., base=16) on its own also accepts signs, surrounding whitespace,
    # underscores and a "0x" prefix.  Insist on plain hex digits so that
    # something like "<U-1>" is rejected instead of becoming a negative index.
    if not digits or any(c not in "0123456789abcdefABCDEF" for c in digits):
        raise ValueError("bad hex digits in codepoint %r" % s)
    return int(digits, base=16)

# Process a line of width output from utf8_gen.py and update global array.
# The array has one entry per codepoint (0 through 0x10FFFF); every entry
# starts out as width 1 and is overwritten by the lines processed here.
widths = [1] * (1 + 0x10FFFF)
def process_width(line):
    # Example lines:
    # <UA8FF>	0
    # <UA926>...<UA92D>	0
    #
    # The first field is a single codepoint or an inclusive FIRST...LAST
    # range; the second field is the width to record.  Raises ValueError or
    # IndexError if the line does not have that shape.

    fields = line.split()
    width = int(fields[1])
    bounds = fields[0].split("...")
    if len(bounds) == 1:
        first = last = parse_ucn(bounds[0])
    elif len(bounds) == 2:
        first = parse_ucn(bounds[0])
        last = parse_ucn(bounds[1])
    else:
        raise ValueError("malformed codepoint range %r" % fields[0])
    # A slice assignment that starts past the end of the list, or that uses
    # negative indices, silently grows or misplaces entries instead of
    # failing, so reject out-of-table and reversed ranges up front.
    if not 0 <= first <= last < len(widths):
        raise ValueError("codepoint range out of bounds: %r" % fields[0])
    widths[first:last + 1] = [width] * (last + 1 - first)

# To keep things simple, we use glibc utf8_gen.py as-is.  It only outputs to a
# file named UTF-8, which is not configurable.  Then we parse this into the form
# we want it.
status = os.system("from_glibc/utf8_gen.py --unicode_version %s"
                   % unicode_version)
if status != 0:
    # Without this check a failed run would either crash on the open below or,
    # worse, silently parse a stale UTF-8 file left over from an earlier run.
    sys.exit("error: from_glibc/utf8_gen.py failed (status %d)" % status)

# Only the lines between the "WIDTH" and "END WIDTH" markers are handed to
# process_width; everything else in the file is skipped.
processing = False
with open("UTF-8", "r") as utf8_file:
    for line in utf8_file:
        if processing:
            if line == "END WIDTH\n":
                processing = False
            else:
                try:
                    process_width(line)
                except (ValueError, IndexError) as e:
                    # Report and skip a malformed line rather than aborting.
                    print(e, "warning: ignored unexpected line: %s" % line,
                          file=sys.stderr, end="")
        elif line == "WIDTH\n":
            processing = True

# All bytes < 256 we treat as width 1.  The slice end is exclusive, so it has
# to be 256 for codepoint 255 to be covered as well.
widths[0:256] = [1] * 256

# Condense the list to contiguous ranges.  Each entry of all_ranges is a
# two-element list: [last codepoint of the run, width shared by the run].
# A run is only recorded when the width changes, so the run still open when
# the loop finishes is not appended.
# NOTE(review): presumably the consumer of these tables applies a default
# width to codepoints past the last recorded end -- confirm before changing.
all_ranges = []
run_end, run_width = -1, 1
for codepoint, w in enumerate(widths):
    if w != run_width:
        # Width changed: close off the previous run and start a new one.
        all_ranges.append([run_end, run_width])
        run_width = w
    run_end = codepoint

# Output the arrays for generated_cpp_wcwidth.h
# Two parallel C arrays are printed to stdout: entry N of wcwidth_range_ends
# is the last codepoint of range N, and entry N of wcwidth_widths is the
# width of that range.
print("/*  Generated by contrib/unicode/gen_wcwidth.py,",
      "with the help of glibc's")
print("    utf8_gen.py, using version %s" % unicode_version,
      "of the Unicode standard.  */")

# Range ends, eight to a row; each row starts on a new line indented by two
# spaces, and the remaining entries are separated by a single space.
print("\nstatic const cppchar_t wcwidth_range_ends[] = {", end="")
ends_text = "".join(
    (" " if index % 8 else "\n  ") + "0x%x," % (entry[0])
    for index, entry in enumerate(all_ranges))
print(ends_text, end="")
print("\n};\n")

# Widths, twenty-four to a row, laid out the same way.
print("static const unsigned char wcwidth_widths[] = {", end="")
widths_text = "".join(
    (" " if index % 24 else "\n  ") + "%d," % entry[1]
    for index, entry in enumerate(all_ranges))
print(widths_text, end="")
print("\n};")