#!/usr/bin/python2
# -*- coding: utf-8 -*-
#
# This script builds unaccent.rules on standard output when given the
# contents of UnicodeData.txt [1] and Latin-ASCII.xml [2] as arguments.
# Ligature expansion and the Unicode CLDR Latin-ASCII transliterator are
# enabled by default; both can be disabled with the
# "--no-ligatures-expansion" command line option.
#
# The approach is to use the Unicode decomposition data to identify
# precomposed codepoints that are equivalent to a ligature of several
# letters, or a base letter with any number of diacritical marks.
#
# This approach handles most letters with diacritical marks and some
# ligatures.  However, several characters (notably a majority of
# ligatures) don't have a decomposition.  To handle all these cases, one
# can use a standard Unicode transliterator available in the Common Locale
# Data Repository (CLDR): Latin-ASCII.  This transliterator associates
# Unicode characters with their ASCII-range equivalents.  Unless the
# "--no-ligatures-expansion" option is given, the XML file of this
# transliterator [2] -- passed as a command line argument -- is parsed
# and used.
#
# The code is written to be valid under both Python 2.7 and Python 3.
#
# [1] http://unicode.org/Public/8.0.0/ucd/UnicodeData.txt
# [2] http://unicode.org/cldr/trac/export/12304/tags/release-28/common/transforms/Latin-ASCII.xml


import re
import argparse
import sys
import xml.etree.ElementTree as ET

try:
    unichr
except NameError:
    # Python 3 has no unichr(); chr() already returns a text character.
    unichr = chr


def print_record(codepoint, letter):
    """Write one "<character> TAB <replacement>" rule to stdout as UTF-8.

    codepoint is the integer code point of the source character and letter
    is its replacement text.
    """
    record = unichr(codepoint) + u"\t" + letter + u"\n"
    # Emit raw UTF-8 bytes regardless of the locale: use the underlying
    # binary stream when stdout is a text stream (Python 3), otherwise
    # write the bytes to stdout itself (Python 2).
    out = getattr(sys.stdout, "buffer", sys.stdout)
    out.write(record.encode("UTF-8"))


class Codepoint(object):
    """One UnicodeData.txt entry, reduced to the fields this script needs."""

    def __init__(self, id, general_category, combining_ids):
        # Integer code point.
        self.id = id
        # General category abbreviation, e.g. "Lu" or "Mn".
        self.general_category = general_category
        # Code points of the decomposition mapping (empty list if none).
        self.combining_ids = combining_ids


def is_plain_letter(codepoint):
    """Return true if codepoint represents a plain ASCII letter."""
    return (ord('a') <= codepoint.id <= ord('z')) or \
           (ord('A') <= codepoint.id <= ord('Z'))


def is_mark(codepoint):
    """Return true for diacritical marks (combining codepoints)."""
    return codepoint.general_category in ("Mn", "Me", "Mc")


def is_letter_with_marks(codepoint, table):
    """Return true for plain letters combined with one or more marks.

    table maps integer code points to Codepoint objects.
    """
    # See http://www.unicode.org/reports/tr44/tr44-14.html#General_Category_Values
    ids = codepoint.combining_ids
    return len(ids) > 1 and \
        is_plain_letter(table[ids[0]]) and \
        all(is_mark(table[i]) for i in ids[1:])


def is_letter(codepoint, table):
    """Return true for a letter with or without diacritical marks."""
    return is_plain_letter(codepoint) or is_letter_with_marks(codepoint, table)


def get_plain_letter(codepoint, table):
    """Return the base codepoint without marks.

    Raises ValueError if codepoint is neither a plain ASCII letter nor a
    plain letter followed by marks.
    """
    if is_letter_with_marks(codepoint, table):
        return table[codepoint.combining_ids[0]]
    if is_plain_letter(codepoint):
        return codepoint
    # A string cannot be raised as an exception; use a real exception type
    # with a message that identifies the offending code point.
    raise ValueError("codepoint U+%04X is not a letter" % codepoint.id)


def is_ligature(codepoint, table):
    """Return true for letters combined with letters."""
    return all(is_letter(table[i], table) for i in codepoint.combining_ids)


def get_plain_letters(codepoint, table):
    """Return a list of plain letters from a ligature."""
    assert is_ligature(codepoint, table)
    return [get_plain_letter(table[i], table) for i in codepoint.combining_ids]


def parse_cldr_latin_ascii_transliterator(latinAsciiFilePath):
    """Parse the XML file and return a set of tuples (src, trg), where "src"
    is the code point of the original character and "trg" the substitute
    text."""
    charactersSet = set()

    # RegEx to parse rules of the form "<src> \u2192 <trg> ;" where <src> is
    # either a literal character or a "\uXXXX" escape, and <trg> is either
    # quoted or bare.
    rulePattern = re.compile(
        u"^(?:(.)|(\\\\u[0-9a-fA-F]{4})) \u2192 (?:'(.+)'|(.+)) ;")

    # construct tree from XML
    transliterationTree = ET.parse(latinAsciiFilePath)
    transliterationTreeRoot = transliterationTree.getroot()

    for rule in transliterationTreeRoot.findall("./transforms/transform/tRule"):
        # An empty <tRule/> element has no text at all; skip it.
        if rule.text is None:
            continue

        matches = rulePattern.search(rule.text)
        if matches is None:
            continue

        # The regular expression captures four groups:
        #
        # Group 1: plain "src" char. Empty if group 2 is not.
        # Group 2: unicode-escaped "src" char (e.g. "\u0110"). Empty if
        #          group 1 is not.
        #
        # Group 3: "trg" text between quotes. Empty if group 4 is not.
        # Group 4: plain (unquoted) "trg" text. Empty if group 3 is not.
        if matches.group(1) is not None:
            src = matches.group(1)
        else:
            # Group 2 is exactly "\uXXXX": decode the four hex digits.
            src = unichr(int(matches.group(2)[2:], 16))

        if matches.group(3) is not None:
            trg = matches.group(3)
        else:
            trg = matches.group(4)

        # "'" and """ are escaped
        trg = trg.replace("\\'", "'").replace('\\"', '"')

        # the parser of unaccent only accepts non-whitespace characters
        # for "src" and "trg" (see unaccent.c)
        if not src.isspace() and not trg.isspace():
            charactersSet.add((ord(src), trg))

    return charactersSet


def special_cases():
    """Return the special cases which are not handled by other methods."""
    return {
        # Cyrillic
        (0x0401, u"\u0415"),  # CYRILLIC CAPITAL LETTER IO
        (0x0451, u"\u0435"),  # CYRILLIC SMALL LETTER IO

        # Symbols of "Letterlike Symbols" Unicode Block (U+2100 to U+214F)
        (0x2103, u"\xb0C"),   # DEGREE CELSIUS
        (0x2109, u"\xb0F"),   # DEGREE FAHRENHEIT
        (0x2117, u"(P)"),     # SOUND RECORDING COPYRIGHT
    }


def main(args):
    """Build the rules and print them to standard output.

    args is the parsed command line; it must provide unicodeDataFilePath,
    latinAsciiFilePath and noLigaturesExpansion.
    """
    # http://www.unicode.org/reports/tr44/tr44-14.html#Character_Decomposition_Mappings
    decomposition_type_pattern = re.compile(" *<[^>]*> *")

    table = {}
    codepoints = []

    # unordered set to ensure uniqueness
    charactersSet = set()

    # read everything we need from UnicodeData.txt into memory; the "with"
    # block guarantees the file is closed again
    with open(args.unicodeDataFilePath, 'r') as unicodeDataFile:
        for line in unicodeDataFile:
            fields = line.split(";")
            if len(fields) > 5:
                # http://www.unicode.org/reports/tr44/tr44-14.html#UnicodeData.txt
                general_category = fields[2]
                # strip the optional "<type>" tag from the decomposition
                decomposition = decomposition_type_pattern.sub(' ', fields[5])
                codepoint_id = int(fields[0], 16)
                combining_ids = [int(s, 16)
                                 for s in decomposition.split(" ") if s != ""]
                codepoint = Codepoint(codepoint_id, general_category,
                                      combining_ids)
                table[codepoint_id] = codepoint
                codepoints.append(codepoint)

    # walk through all the codepoints looking for interesting mappings
    for codepoint in codepoints:
        if not codepoint.general_category.startswith('L') or \
           len(codepoint.combining_ids) <= 1:
            continue
        if is_letter_with_marks(codepoint, table):
            charactersSet.add((codepoint.id,
                               unichr(get_plain_letter(codepoint, table).id)))
        elif not args.noLigaturesExpansion and is_ligature(codepoint, table):
            charactersSet.add((codepoint.id,
                               u"".join(unichr(letter.id)
                                        for letter
                                        in get_plain_letters(codepoint, table))))

    # add CLDR Latin-ASCII characters
    if not args.noLigaturesExpansion:
        charactersSet |= parse_cldr_latin_ascii_transliterator(args.latinAsciiFilePath)
        charactersSet |= special_cases()

    # Sort for more convenient display.  Sorting on the whole pair (not just
    # the code point) keeps the output order deterministic even when one
    # code point has several candidate replacements.
    for characterPair in sorted(charactersSet):
        print_record(characterPair[0], characterPair[1])


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='This script builds unaccent.rules on standard output when given the contents of UnicodeData.txt and Latin-ASCII.xml given as arguments.')
    parser.add_argument("--unicode-data-file", help="Path to formatted text file corresponding to UnicodeData.txt. See <http://unicode.org/Public/8.0.0/ucd/UnicodeData.txt>.", type=str, required=True, dest='unicodeDataFilePath')
    parser.add_argument("--latin-ascii-file", help="Path to XML file from Unicode Common Locale Data Repository (CLDR) corresponding to Latin-ASCII transliterator (Latin-ASCII.xml). See <http://unicode.org/cldr/trac/export/12304/tags/release-28/common/transforms/Latin-ASCII.xml>.", type=str, dest='latinAsciiFilePath')
    parser.add_argument("--no-ligatures-expansion", help="Do not expand ligatures and do not use Unicode CLDR Latin-ASCII transliterator. By default, this option is not enabled and \"--latin-ascii-file\" argument is required. If this option is enabled, \"--latin-ascii-file\" argument is optional and ignored.", action="store_true", dest='noLigaturesExpansion')
    args = parser.parse_args()

    if not args.noLigaturesExpansion and args.latinAsciiFilePath is None:
        sys.stderr.write('You must specify the path to Latin-ASCII transliterator file with \"--latin-ascii-file\" option or use \"--no-ligatures-expansion\" option. Use \"-h\" option for help.')
        sys.exit(1)

    main(args)