#!/usr/bin/python2
# -*- coding: utf-8 -*-
#
# This script builds unaccent.rules on standard output when given the
# contents of UnicodeData.txt [1] and Latin-ASCII.xml [2] as
# arguments. Ligature expansion and the Unicode CLDR Latin-ASCII
# transliterator are optional but enabled by default; they can be disabled
# with the "--no-ligatures-expansion" command line option.
#
# The approach is to use the Unicode decomposition data to identify
# precomposed codepoints that are equivalent to a ligature of several
# letters, or a base letter with any number of diacritical marks.
#
# This approach handles most letters with diacritical marks and some
# ligatures. However, several characters (notably a majority of
# ligatures) don't have a decomposition. To handle all these cases, one can
# use a standard Unicode transliterator available in the Common Locale Data
# Repository (CLDR): Latin-ASCII. This transliterator associates Unicode
# characters with ASCII-range equivalents. Unless the
# "--no-ligatures-expansion" option is enabled, the XML file of this
# transliterator [2] -- given as a command line argument -- will be parsed
# and used.
#
# [1] http://unicode.org/Public/8.0.0/ucd/UnicodeData.txt
# [2] http://unicode.org/cldr/trac/export/12304/tags/release-28/common/transforms/Latin-ASCII.xml


import re
import argparse
import sys
import xml.etree.ElementTree as ET

# Python 2 spells "integer -> unicode character" as unichr(); Python 3
# only has chr().  Resolve it once so the rest of the script runs on both.
try:
    _unichr = unichr
except NameError:
    _unichr = chr


def print_record(codepoint, letter):
    """Write one rule line to standard output, encoded as UTF-8.

    The line is the character for the integer `codepoint`, a tab, the
    replacement text `letter`, and a newline.
    """
    record = _unichr(codepoint) + u"\t" + letter + u"\n"
    encoded = record.encode("UTF-8")

    # Python 3 text streams expose the underlying byte stream as ".buffer";
    # writing there keeps the output UTF-8 whatever the locale is.
    binary_stdout = getattr(sys.stdout, "buffer", None)
    if binary_stdout is not None:
        binary_stdout.write(encoded)
    elif str is bytes:
        # Python 2: sys.stdout accepts byte strings directly.
        sys.stdout.write(encoded)
    else:
        # Python 3 with a text-only replacement stream.
        sys.stdout.write(record)


class Codepoint(object):
    """One parsed UnicodeData.txt entry."""

    def __init__(self, id, general_category, combining_ids):
        # Integer value of the codepoint.
        self.id = id
        # Two-letter general category string, e.g. "Ll" or "Mn".
        self.general_category = general_category
        # List of integer codepoints from the decomposition field
        # (empty when the entry has no decomposition).
        self.combining_ids = combining_ids


def is_plain_letter(codepoint):
    """Return true if codepoint represents a plain ASCII letter."""
    return (ord('a') <= codepoint.id <= ord('z')) or \
           (ord('A') <= codepoint.id <= ord('Z'))


def is_mark(codepoint):
    """Returns true for diacritical marks (combining codepoints)."""
    return codepoint.general_category in ("Mn", "Me", "Mc")


def is_letter_with_marks(codepoint, table):
    """Returns true for letters combined with one or more marks.

    `table` maps integer codepoints to Codepoint objects and is used to
    look up the members of the decomposition.
    """
    # See http://www.unicode.org/reports/tr44/tr44-14.html#General_Category_Values

    # With fewer than two decomposition members there is no room for a
    # base letter followed by a mark.
    if len(codepoint.combining_ids) < 2:
        return False

    # A letter without diacritical marks has none of them.
    if not any(is_mark(table[i]) for i in codepoint.combining_ids[1:]):
        return False

    # The base must itself be a plain letter or a letter with marks.
    base = table[codepoint.combining_ids[0]]
    if not is_plain_letter(base) and not is_letter_with_marks(base, table):
        return False

    return True


def is_letter(codepoint, table):
    """Return true for letter with or without diacritical marks."""
    return is_plain_letter(codepoint) or is_letter_with_marks(codepoint, table)


def get_plain_letter(codepoint, table):
    """Return the base codepoint without marks.

    If the base of this codepoint itself has more than one combining
    character, do a recursive lookup on the table to find out its plain
    base letter.

    Raises AssertionError when `codepoint` is neither a plain letter nor a
    letter with marks.
    """
    if is_letter_with_marks(codepoint, table):
        base = table[codepoint.combining_ids[0]]
        if len(base.combining_ids) > 1:
            return get_plain_letter(base, table)
        if is_plain_letter(base):
            return base

        # Should not come here.  Raised explicitly (rather than with an
        # assert statement) so the check survives "python -O".
        raise AssertionError("no plain base letter for codepoint %X" % codepoint.id)
    elif is_plain_letter(codepoint):
        return codepoint

    # Should not come here
    raise AssertionError("codepoint %X is not a letter" % codepoint.id)


def is_ligature(codepoint, table):
    """Return true for letters combined with letters."""
    return all(is_letter(table[i], table) for i in codepoint.combining_ids)


def get_plain_letters(codepoint, table):
    """Return a list of plain letters from a ligature.

    Raises AssertionError when `codepoint` is not a ligature.
    """
    if not is_ligature(codepoint, table):
        raise AssertionError("codepoint %X is not a ligature" % codepoint.id)
    return [get_plain_letter(table[member], table)
            for member in codepoint.combining_ids]


def parse_cldr_latin_ascii_transliterator(latinAsciiFilePath):
    """Parse the XML file and return a set of tuples (src, trg), where "src"
    is the integer codepoint of the original character and "trg" the
    substitute string."""
    charactersSet = set()

    # RegEx to parse rules of the form "<src> <U+2192 arrow> <trg> ;".
    # Written as a plain unicode literal (not ur'...') so that it is valid
    # on both Python 2 and Python 3.
    rulePattern = re.compile(
        u'^(?:(.)|(\\\\u[0-9a-fA-F]{4})) \u2192 (?:\'(.+)\'|(.+)) ;')

    # construct tree from XML
    transliterationTree = ET.parse(latinAsciiFilePath)
    transliterationTreeRoot = transliterationTree.getroot()

    for rule in transliterationTreeRoot.findall("./transforms/transform/tRule"):
        # An empty <tRule/> element has no text at all.
        if rule.text is None:
            continue

        matches = rulePattern.search(rule.text)

        # The regular expression captures four groups corresponding
        # to the characters.
        #
        # Group 1: plain "src" char. Empty if group 2 is not.
        # Group 2: unicode-escaped "src" char (e.g. "\u0110"). Empty if group 1 is not.
        #
        # Group 3: "trg" text between quotes. Empty if group 4 is not.
        # Group 4: plain (unquoted) "trg" text. Empty if group 3 is not.
        if matches is None:
            continue

        if matches.group(1) is not None:
            src = matches.group(1)
        else:
            # Group 2 is exactly "\u" followed by four hex digits.
            src = _unichr(int(matches.group(2)[2:], 16))

        if matches.group(3) is not None:
            trg = matches.group(3)
        else:
            trg = matches.group(4)

        # "'" and """ are escaped
        trg = trg.replace("\\'", "'").replace('\\"', '"')

        # the parser of unaccent only accepts non-whitespace characters
        # for "src" and "trg" (see unaccent.c)
        if not src.isspace() and not trg.isspace():
            charactersSet.add((ord(src), trg))

    return charactersSet


def special_cases():
    """Returns the special cases which are not handled by other methods"""
    charactersSet = set()

    # Cyrillic
    charactersSet.add((0x0401, u"\u0415"))  # CYRILLIC CAPITAL LETTER IO
    charactersSet.add((0x0451, u"\u0435"))  # CYRILLIC SMALL LETTER IO

    # Symbols of "Letterlike Symbols" Unicode Block (U+2100 to U+214F)
    charactersSet.add((0x2103, u"\xb0C"))  # DEGREE CELSIUS
    charactersSet.add((0x2109, u"\xb0F"))  # DEGREE FAHRENHEIT
    charactersSet.add((0x2117, u"(P)"))  # SOUND RECORDING COPYRIGHT

    return charactersSet


def main(args):
    """Read the input files named in `args` and print the rules.

    `args` must provide `unicodeDataFilePath`, `latinAsciiFilePath` and
    `noLigaturesExpansion` (as produced by the argument parser below).
    """
    # http://www.unicode.org/reports/tr44/tr44-14.html#Character_Decomposition_Mappings
    decomposition_type_pattern = re.compile(" *<[^>]*> *")

    table = {}
    codepoints = []

    # unordered set to ensure uniqueness
    charactersSet = set()

    # read everything we need from UnicodeData.txt into memory; the
    # "with" block closes the file even if parsing fails
    with open(args.unicodeDataFilePath, 'r') as unicodeDataFile:
        for line in unicodeDataFile:
            fields = line.split(";")
            if len(fields) > 5:
                # http://www.unicode.org/reports/tr44/tr44-14.html#UnicodeData.txt
                general_category = fields[2]
                # Drop the "<type>" tag of compatibility decompositions,
                # keeping only the hexadecimal codepoints.
                decomposition = decomposition_type_pattern.sub(' ', fields[5])
                codepoint_id = int(fields[0], 16)
                combining_ids = [int(s, 16)
                                 for s in decomposition.split(" ") if s != ""]
                codepoint = Codepoint(codepoint_id, general_category,
                                      combining_ids)
                table[codepoint_id] = codepoint
                codepoints.append(codepoint)

    expand_ligatures = not args.noLigaturesExpansion

    # walk through all the codepoints looking for interesting mappings
    for codepoint in codepoints:
        if codepoint.general_category.startswith('L') and \
           len(codepoint.combining_ids) > 1:
            if is_letter_with_marks(codepoint, table):
                charactersSet.add((codepoint.id,
                                   _unichr(get_plain_letter(codepoint, table).id)))
            elif expand_ligatures and is_ligature(codepoint, table):
                charactersSet.add((codepoint.id,
                                   u"".join(_unichr(combining_codepoint.id)
                                            for combining_codepoint
                                            in get_plain_letters(codepoint, table))))

    # add CLDR Latin-ASCII characters
    if expand_ligatures:
        charactersSet |= parse_cldr_latin_ascii_transliterator(args.latinAsciiFilePath)
        charactersSet |= special_cases()

    # Sort for more convenient display.  Sorting on the whole pair (not
    # just the codepoint) keeps the output deterministic when the same
    # codepoint was collected with two different replacements.
    charactersList = sorted(charactersSet)

    for characterPair in charactersList:
        print_record(characterPair[0], characterPair[1])


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='This script builds unaccent.rules on standard output when given the contents of UnicodeData.txt and Latin-ASCII.xml given as arguments.')
    parser.add_argument("--unicode-data-file", help="Path to formatted text file corresponding to UnicodeData.txt. See <http://unicode.org/Public/8.0.0/ucd/UnicodeData.txt>.", type=str, required=True, dest='unicodeDataFilePath')
    parser.add_argument("--latin-ascii-file", help="Path to XML file from Unicode Common Locale Data Repository (CLDR) corresponding to Latin-ASCII transliterator (Latin-ASCII.xml). See <http://unicode.org/cldr/trac/export/12304/tags/release-28/common/transforms/Latin-ASCII.xml>.", type=str, dest='latinAsciiFilePath')
    parser.add_argument("--no-ligatures-expansion", help="Do not expand ligatures and do not use Unicode CLDR Latin-ASCII transliterator. By default, this option is not enabled and \"--latin-ascii-file\" argument is required. If this option is enabled, \"--latin-ascii-file\" argument is optional and ignored.", action="store_true", dest='noLigaturesExpansion')
    args = parser.parse_args()

    if not args.noLigaturesExpansion and args.latinAsciiFilePath is None:
        sys.stderr.write('You must specify the path to Latin-ASCII transliterator file with \"--latin-ascii-file\" option or use \"--no-ligatures-expansion\" option. Use \"-h\" option for help.\n')
        sys.exit(1)

    main(args)