#!/usr/bin/env python3
#
# Script to generate tables for cpp_wcwidth, leveraging glibc's utf8_gen.py.
#
# This file is part of GCC.
#
# GCC is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 3, or (at your option) any later
# version.
#
# GCC is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
# for more details.
#
# You should have received a copy of the GNU General Public License
# along with GCC; see the file COPYING3.  If not see
# <http://www.gnu.org/licenses/>.

import os
import subprocess
import sys

# Width of every code point, indexed by code point.  Everything starts at
# width 1; process_width() overrides the entries listed by utf8_gen.py.
widths = [1] * (1 + 0x10FFFF)


def parse_ucn(s):
    """Parse a codepoint in the format output by glibc tools.

    S looks like "<UA8FF>"; the hexadecimal digits between "<U" and ">"
    are returned as an int.  Raises ValueError if S does not have that
    shape or the digits are not valid hexadecimal.
    """
    if not (s.startswith("<U") and s.endswith(">")):
        raise ValueError("malformed code point: %r" % s)
    return int(s[2:-1], base=16)


def process_width(line):
    """Process a line of width output from utf8_gen.py; update `widths`.

    Example lines:
      <UA8FF> 0
      <UA926>...<UA92D> 0

    Raises ValueError for a malformed code point, width or range, and
    IndexError when the line has fewer than two fields.  `widths` is left
    untouched whenever an exception is raised.
    """
    s = line.split()
    width = int(s[1])
    r = s[0].split("...")
    if len(r) == 1:
        begin = parse_ucn(r[0])
        end = begin + 1
    elif len(r) == 2:
        begin = parse_ucn(r[0])
        end = parse_ucn(r[1]) + 1
    else:
        raise ValueError("malformed code point range: %r" % s[0])
    # A slice assignment past the end of the list would silently grow it,
    # and a reversed range would silently do nothing; reject both so the
    # caller can report the line instead.
    if not 0 <= begin < end <= len(widths):
        raise ValueError("code point range out of bounds: %r" % s[0])
    widths[begin:end] = [width] * (end - begin)


def read_width_section(path):
    """Feed every line between "WIDTH" and "END WIDTH" in PATH to
    process_width().

    Lines that process_width() rejects are reported on stderr and
    skipped; lines outside the WIDTH section are ignored.
    """
    processing = False
    with open(path, "r") as charmap:
        for line in charmap:
            if processing:
                if line == "END WIDTH\n":
                    processing = False
                else:
                    try:
                        process_width(line)
                    except (ValueError, IndexError) as e:
                        print(e, "warning: ignored unexpected line: %s" % line,
                              file=sys.stderr, end="")
            elif line == "WIDTH\n":
                processing = True


def condense_ranges(width_table):
    """Condense WIDTH_TABLE to contiguous ranges.

    Returns a list of (last_index, width) pairs, one per run of equal
    consecutive widths, in increasing index order.  The run that is still
    open when the table ends is NOT emitted.
    NOTE(review): presumably the consumer of the generated tables falls
    back to a default width past the last range end -- confirm before
    changing this, since emitting the final run would alter the generated
    header.
    """
    # The sentinel index -1 only survives if the very first entry is not
    # of width 1.
    range_end, range_width = -1, 1
    all_ranges = []
    for i, width in enumerate(width_table):
        if width != range_width:
            all_ranges.append((range_end, range_width))
            range_width = width
        range_end = i
    return all_ranges


def _format_array(declaration, items, per_line):
    """Return DECLARATION followed by ITEMS, PER_LINE to a row, and the
    closing "};" line."""
    parts = [declaration]
    for i, item in enumerate(items):
        parts.append(" " if i % per_line else "\n ")
        parts.append(item)
    parts.append("\n};\n")
    return "".join(parts)


def format_tables(all_ranges, unicode_version):
    """Return the text of the arrays for generated_cpp_wcwidth.h.

    ALL_RANGES is a sequence of (last_index, width) pairs as returned by
    condense_ranges(); UNICODE_VERSION is only mentioned in the leading
    comment.
    """
    header = ("/* Generated by contrib/unicode/gen_wcwidth.py,"
              + " " + "with the help of glibc's\n"
              + " utf8_gen.py, using version %s" % unicode_version
              + " " + "of the Unicode standard. */\n")
    range_ends = _format_array(
        "static const cppchar_t wcwidth_range_ends[] = {",
        ["0x%x," % r[0] for r in all_ranges], 8)
    range_widths = _format_array(
        "static const unsigned char wcwidth_widths[] = {",
        ["%d," % r[1] for r in all_ranges], 24)
    return header + "\n" + range_ends + "\n" + range_widths


def main(argv):
    """Generate the tables for the Unicode version named in ARGV[1].

    Returns the process exit status: 0 on success, 1 on a usage error or
    when utf8_gen.py could not be run successfully.
    """
    if len(argv) != 2:
        prog = argv[0] if argv else "gen_wcwidth.py"
        print("usage: %s <unicode version>" % prog, file=sys.stderr)
        return 1
    unicode_version = argv[1]

    # To keep things simple, we use glibc utf8_gen.py as-is.  It only outputs
    # to a file named UTF-8, which is not configurable.  Then we parse this
    # into the form we want it.
    generator = os.path.join("from_glibc", "utf8_gen.py")
    # Pass an argument list rather than a shell string so the version
    # argument is never interpreted by a shell.
    try:
        status = subprocess.run(
            [generator, "--unicode_version", unicode_version]).returncode
    except OSError as e:
        print("error: cannot run %s: %s" % (generator, e), file=sys.stderr)
        return 1
    if status != 0:
        # Do not go on to parse a missing or stale UTF-8 file.
        print("error: %s exited with status %d" % (generator, status),
              file=sys.stderr)
        return 1
    read_width_section("UTF-8")

    # All bytes < 256 we treat as width 1.
    widths[0:256] = [1] * 256

    print(format_tables(condense_ranges(widths), unicode_version), end="")
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))