#!/usr/bin/env python3
#
# Copyright 2011-2015 The Rust Project Developers. See the COPYRIGHT
# file at the top-level directory of this distribution and at
# http://rust-lang.org/COPYRIGHT.
#
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
# option. This file may not be copied, modified, or distributed
# except according to those terms.

# This script uses the following Unicode security tables:
# - IdentifierStatus.txt
# - IdentifierType.txt
# - PropertyValueAliases.txt
# - confusables.txt
# - ReadMe.txt
# This script also uses the following Unicode UCD data:
# - Scripts.txt
#
# Since this should not require frequent updates, we just store this
# out-of-line and check the tables.rs file into git.

import fileinput, re, os, sys, operator

preamble = '''// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// NOTE: The following code was generated by "scripts/unicode.py", do not edit directly

#![allow(missing_docs, non_upper_case_globals, non_snake_case)]
'''

UNICODE_VERSION = (13, 0, 0)

UNICODE_VERSION_NUMBER = "%s.%s.%s" %UNICODE_VERSION

# Shared download step for `fetch` and `fetch_unidata`.
def _download(url, f):
    """Make sure `f` is present in the current directory, downloading `url` if not.

    Only the base name of `f` is checked, because `curl -O` stores the
    download under the remote file name in the working directory.
    Terminates the script with exit status 1 when the file is still
    missing after the download attempt.
    """
    local_name = os.path.basename(f)
    if not os.path.exists(local_name):
        os.system("curl -O " + url)

    if not os.path.exists(local_name):
        sys.stderr.write("cannot load %s\n" % f)
        sys.exit(1)

# Download a Unicode security table file
def fetch(f):
    """Fetch `f` from the Unicode `security` directory for UNICODE_VERSION_NUMBER."""
    _download("http://www.unicode.org/Public/security/%s/%s"
              % (UNICODE_VERSION_NUMBER, f), f)

# Download a UCD table file
def fetch_unidata(f):
    """Fetch `f` from the Unicode `ucd` directory for UNICODE_VERSION_NUMBER."""
    _download("http://www.unicode.org/Public/%s/ucd/%s"
              % (UNICODE_VERSION_NUMBER, f), f)

# Shared parser for "<code point or range> ; <property>" data files.
def _load_ranges(path, single_re, range_re, interestingprops):
    """Parse `path` into `{property: [(lo, hi), ...]}`.

    `single_re` must capture (code point, property) and `range_re` must
    capture (lo, hi, property); lines matching neither are skipped.
    When `interestingprops` is non-empty, only the properties it contains
    are kept. Ranges are inclusive and are kept in file order.
    """
    props = {}
    # A `with` block (rather than the module-global `fileinput.input`) closes
    # the file even when parsing fails, and cannot leave a half-consumed
    # global reader behind that would make the next loader raise.
    with open(path, encoding="utf-8") as src:
        for line in src:
            m = single_re.match(line)
            if m:
                d_lo = d_hi = m.group(1)
                prop = m.group(2)
            else:
                m = range_re.match(line)
                if not m:
                    continue
                d_lo, d_hi, prop = m.group(1), m.group(2), m.group(3)
            prop = prop.strip()
            if interestingprops and prop not in interestingprops:
                continue
            props.setdefault(prop, []).append((int(d_lo, 16), int(d_hi, 16)))

    return props

# Loads code point data from IdentifierStatus.txt and
# IdentifierType.txt
# Implementation from unicode-segmentation
def load_properties(f, interestingprops = None):
    """Load a security table whose property values are single words.

    Returns `{property: [(lo, hi), ...]}` with inclusive integer ranges.
    `interestingprops`, when non-empty, restricts the result to those
    properties.
    """
    fetch(f)
    re1 = re.compile(r"^ *([0-9A-F]+) *; *(\w+)")
    re2 = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)")
    return _load_ranges(os.path.basename(f), re1, re2, interestingprops)

# Loads script data from Scripts.txt
def load_script_properties(f, interestingprops):
    """Load a UCD table whose property value runs up to the `#` comment.

    Returns `{property: [(lo, hi), ...]}` with inclusive integer ranges.
    `interestingprops`, when non-empty, restricts the result to those
    properties.
    """
    fetch_unidata(f)
    # Note: these regexes are different from those in unicode-segmentation,
    # because we need to handle spaces here
    re1 = re.compile(r"^ *([0-9A-F]+) *; *([^#]+) *#")
    re2 = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *([^#]+) *#")
    return _load_ranges(os.path.basename(f), re1, re2, interestingprops)

# Loads confusables data from confusables.txt
def load_confusables(f):
    """Load confusable mappings as a list of `(source, [prototype, ...])`.

    `source` is a single code point and the prototype is the list of code
    points it maps to, all as integers. Raises `Exception` when a line has
    more than one code point in its first column.
    """
    fetch(f)
    confusables = []
    re1 = re.compile(r"^((?:[0-9A-F]+ )+);\t((?:[0-9A-F]+ )+);\t\w*")

    with open(os.path.basename(f), encoding="utf-8") as src:
        for line in src:
            m = re1.match(line)
            if not m:
                continue
            d_inputs = m.group(1).split()
            if len(d_inputs) != 1:
                raise Exception('More than one code point in first column')
            d_input = int(d_inputs[0].strip(), 16)
            d_outputs = [int(d_output, 16) for d_output in m.group(2).split()]
            confusables.append((d_input, d_outputs))

    return confusables

# Loads Unicode script name correspondence from PropertyValueAliases.txt
def aliases():
    """Return `(longforms, shortforms)` for the `sc` (script) property.

    `longforms` maps the short alias to the long name and `shortforms`
    maps the long name back to the short alias. Both directions are
    asserted to be unique.
    """
    # This function is taken from the `unicode-script` crate. If significant
    # changes are introduced, update accordingly.

    # Note that this file is in UCD directly, not security directory.
    # we use `fetch_unidata` function to download it.
    fetch_unidata("PropertyValueAliases.txt")
    longforms = {}
    shortforms = {}
    re1 = re.compile(r"^ *sc *; *(\w+) *; *(\w+)")
    with open(os.path.basename("PropertyValueAliases.txt"), encoding="utf-8") as src:
        for line in src:
            m = re1.match(line)
            if not m:
                continue
            l = m.group(2).strip()
            s = m.group(1).strip()
            assert s not in longforms
            assert l not in shortforms
            longforms[s] = l
            shortforms[l] = s

    return (longforms, shortforms)

# Loads Unicode script name list and correspondence mapping
def load_scripts(f):
    """Return `(longforms, script_table)` built from the script data file `f`.

    `script_table` is a list of `(lo, hi, short_script_name)` tuples sorted
    by `lo`, covering every script found in `f`.
    """
    # This function is taken from the `unicode-script` crate. If significant
    # changes are introduced, update accordingly.

    (longforms, shortforms) = aliases()
    scripts = load_script_properties(f, [])

    script_table = []
    for script in scripts:
        script_table.extend([(x, y, shortforms[script]) for (x, y) in scripts[script]])
    script_table.sort(key=lambda w: w[0])
    return (longforms, script_table)

def is_script_ignored_in_mixedscript(source):
    """Tell whether the short script name is one of Zinh, Zyyy or Zzzz."""
    return source in ('Zinh', 'Zyyy', 'Zzzz')

# When a codepoint's prototype consists of multiple codepoints,
# the situation is more complex. Here we make up a few rules
# to cover all the cases in confusables.txt .
# The principle is that when replacing the original codepoint with its prototype,
# a "non-ignored script" neither appears nor disappears.
#
# We make up several rules to cover the cases that occur within confusables.txt
# Return True, True when we want to consider it confusable,
# and return True, False when we want to consider it non-confusable.
# and return False, _ when new not-yet-processed cases are added in future Unicode versions.
def process_mixedscript_single_to_multi(item_i, script_i, proto_lst, scripts):
    """Decide whether a code point is confusable with its multi code point prototype.

    `script_i` is the script of the single code point `item_i` and
    `proto_lst` is its prototype (a list of code points). Returns a pair
    `(processed, should_add)`:

    * `(True, True)`   - a rule applied and the code point is confusable,
    * `(True, False)`  - a rule applied and the code point is not confusable,
    * `(False, False)` - no rule covers this combination of scripts.

    Asserts that the prototype yields at least one script.
    """
    # The rules below only depend on which scripts are present, not on
    # their order, so the script list is used as returned.
    script_lst = script_list(proto_lst, scripts)
    script_lst_len = len(script_lst)
    assert script_lst_len > 0
    # Rules "A - A" and "A ... - A": the prototype already contains the
    # code point's own script -> Processed, DontAdd
    if script_i in script_lst:
        return True, False
    # Every remaining rule requires the code point itself to be in a
    # non-ignored script (not Zinh, Zyyy or Zzzz).
    if is_script_ignored_in_mixedscript(script_i):
        return False, False
    # Rules "B - A" and "(Zinh | Zyyy | Zzzz) - A": the prototype has a single
    # script different from the code point's script -> Processed, Add
    if script_lst_len == 1:
        return True, True
    # Rules "(Zinh | Zyyy | Zzzz) B - A" and
    # "(Zinh | Zyyy | Zzzz) (Zinh | Zyyy | Zzzz) - A": two scripts, at least
    # one of them ignored -> Processed, Add
    if script_lst_len == 2 and any(is_script_ignored_in_mixedscript(s) for s in script_lst):
        return True, True

    # NotProcessed, DontAdd
    return False, False

def is_codepoint_identifier_allowed(c, identifier_allowed):
    """Tell whether `c` lies inside any inclusive `(lo, hi, ...)` range of `identifier_allowed`."""
    return any(data[0] <= c <= data[1] for data in identifier_allowed)

# Returns the list that collects the code points confusable with `item`,
# creating the `script` and `item` entries on first use.
def _confusable_entry_item(confusable, script, item):
    script_entry = confusable.setdefault(script, {})
    return script_entry.setdefault(escape_char(item), (item, []))[1]

# Records, for every pair of `members` that are in different scripts, each
# member that is in a non-ignored script as confusable with the other one.
def _mark_cross_script_pairs(confusable, members, scripts):
    member_count = len(members)
    if member_count < 2:
        # No pairs to examine.
        return
    # Resolve each member's script once instead of once per pair.
    member_scripts = [codepoint_script(member, scripts) for member in members]
    for i in range(member_count - 1):
        for j in range(i + 1, member_count):
            item_i, item_j = members[i], members[j]
            script_i, script_j = member_scripts[i], member_scripts[j]
            # If they're in the same script, just skip this pair.
            if script_i == script_j:
                continue
            # If `item_i` (the first) is in a non-ignored script, and `item_j` (the second) is in a
            # different one (maybe ignored), this usage of `item_i` can be suspicious when it occurs
            # in a document that is written in `script_j`.
            # We'll consider it a mixed_script_confusable code point, and store it within the map,
            # saving as much information as possible for further investigation on the final results.
            if not is_script_ignored_in_mixedscript(script_i):
                _confusable_entry_item(confusable, script_i, item_i).append(item_j)
            # Do the same in reverse from `item_j` to `item_i`
            if not is_script_ignored_in_mixedscript(script_j):
                _confusable_entry_item(confusable, script_j, item_j).append(item_i)

# This function loads and generates a table of all the confusable characters.
# It returns a pair consisting of a `mixedscript_confusable` table and a
# `mixedscript_confusable_unresolved` table.
# The `mixedscript_confusable` is a dict, its keys are Unicode script names, and each
# entry has a value of an inner dict. The inner dict's keys are confusable code points
# converted to string with the `escape_char` function, and its values are pairs.
# pair[0] keeps a copy of the confusable code point itself but as integer.
# pair[1] keeps a list of all the code points that are mixed script confusable with it,
#         which is only used for debugging purposes.
#         note that the string 'multi' will occur in the list when pair[0] is considered
#         confusable with its multiple code point prototype.
# Usually the `mixedscript_confusable_unresolved` table is empty, but it's possible
# that a future Unicode version update may cause that table to become nonempty, in which
# case more rules need to be added to the `process_mixedscript_single_to_multi` function
# above to cover those new cases.
def load_potential_mixedscript_confusables(f, identifier_allowed, scripts):
    """Build the mixed-script confusable tables from the confusables file `f`.

    `identifier_allowed` is a list of inclusive `(lo, hi)` ranges of code
    points allowed in identifiers and `scripts` is a list of
    `(lo, hi, script)` ranges. Returns
    `(mixedscript_confusable, mixedscript_confusable_unresolved)`; the
    second dict maps the escaped prototype text to
    `(prototype, [code points no rule could classify])`.
    """
    # First, load all confusables data from confusables.txt
    confusables = load_confusables(f)

    # The confusables.txt is reductive, means that it is intended to be used in
    # on the fly substitutions. The code points that didn't occur in the file can be
    # seen as substitutes to itself. So if the confusables.txt says A -> C, B -> C,
    # and implicitly C -> C, it means A <-> B, A <-> C, B <-> C are confusable.

    # Here we're dividing all confusable lhs and rhs(prototype) operands of the substitution into
    # equivalence classes. Principally we'll be using the rhs operands as the representative
    # element of its equivalence classes. However some rhs operands are single code point, while
    # some others are not. Here we collect them separately into `codepoint_map` and
    # `multicodepoint_map`.
    codepoint_map = {}
    multicodepoint_map = {}
    for d_source, d_proto_list in confusables:
        # According to the RFC, we'll skip those code points that are restricted from identifier usage.
        if not is_codepoint_identifier_allowed(d_source, identifier_allowed):
            continue
        if len(d_proto_list) == 1:
            # we use the escaped representation of rhs as key to the dict when creating new equivalence class.
            d_proto = escape_char(d_proto_list[0])
            if d_proto not in codepoint_map:
                codepoint_map[d_proto] = []
                # when we create new equivalence class, we'll check whether the representative element
                # should be collected, i.e. if it is not restricted from identifier usage, we collect
                # it into the equivalence class.
                if is_codepoint_identifier_allowed(d_proto_list[0], identifier_allowed):
                    codepoint_map[d_proto].append(d_proto_list[0])
            # we collect the original code point to be substituted into this list.
            codepoint_map[d_proto].append(d_source)
        else:
            # difference in multi code point case: the rhs part is not directly usable, however we
            # store it in dict for further special examination between each lhs and this multi code
            # point rhs. and there's an extra level of tuple here.
            d_protos = escape_char_list(d_proto_list)
            if d_protos not in multicodepoint_map:
                multicodepoint_map[d_protos] = (d_proto_list, [])
            multicodepoint_map[d_protos][1].append(d_source)

    mixedscript_confusable = {}

    # First let's examine each pair of code points sharing a single code point prototype.
    for source in codepoint_map.values():
        _mark_cross_script_pairs(mixedscript_confusable, source, scripts)

    # Then do the same between the code points that share the same multiple code point prototype.
    for _, source in multicodepoint_map.values():
        _mark_cross_script_pairs(mixedscript_confusable, source, scripts)

    mixedscript_confusable_unresolved = {}
    # We'll also check between each code point and its multiple code point prototype
    for proto_lst, source in multicodepoint_map.values():
        # If the prototype contains one or more restricted code point, then we skip it.
        if not all(is_codepoint_identifier_allowed(c, identifier_allowed) for c in proto_lst):
            continue
        for item_i in source:
            # So here we're just checking whether the single code point should be considered confusable.
            script_i = codepoint_script(item_i, scripts)
            # If it's in ignored script, we don't need to do anything here.
            if is_script_ignored_in_mixedscript(script_i):
                continue
            # Here're some rules on examining whether the single code point should be considered
            # confusable. The principle is that, when substitution happens, no new non-ignored
            # script is introduced, and its own script is not lost.
            processed, should_add = process_mixedscript_single_to_multi(item_i, script_i, proto_lst, scripts)
            if should_add:
                assert processed
                # Mark the single code point as confusable.
                _confusable_entry_item(mixedscript_confusable, script_i, item_i).append('multi')
            if processed:
                # Finished dealing with this code point.
                continue
            # If it's not processed we must be dealing with a newer version Unicode data, which
            # introduced some significant changes. We don't throw an exception here, instead we
            # collect it into a table for debugging purpose, and throw an exception after we
            # returned and printed the table out.
            proto_lst_text = escape_char_list(proto_lst)
            if proto_lst_text not in mixedscript_confusable_unresolved:
                mixedscript_confusable_unresolved[proto_lst_text] = (proto_lst, [])
            mixedscript_confusable_unresolved[proto_lst_text][1].append(item_i)
    return (mixedscript_confusable, mixedscript_confusable_unresolved)

def codepoint_script(c, scripts):
    """Return the script of the first `(lo, hi, script)` range in `scripts` containing `c`.

    Raises `Exception` when no range contains `c`.
    """
    for x, y, script in scripts:
        if x <= c <= y:
            return script
    raise Exception("Not in scripts: " + escape_char(c))

# Emit some useful information for debugging when further update happens.
def debug_emit_mixedscript_confusable(f, mixedscript_confusable, text, scripts):
    """Write the mixed-script confusable table into `f` as a `/* ... */` block.

    For every script, one line per confusable code point (ascending) lists
    the code points it is confusable with and the scripts of those code
    points.
    """
    f.write("/* " + text + "\n")
    for script, lst in mixedscript_confusable.items():
        f.write("/// Script - " + script + "\n")
        for source in sorted(v[0] for v in lst.values()):
            source_text = escape_char(source)
            target_lst = lst[source_text][1]
            f.write(source_text + " => " + escape_char_list(target_lst) + " // " + escape_script_list(target_lst, scripts) + "\n")
    f.write("*/\n")


def script_list(char_lst, scripts):
    """Return the distinct scripts of `char_lst` in order of first occurrence.

    The marker string 'multi' is reported as the pseudo script 'Z~multi'.
    """
    script_lst = []
    for c in char_lst:
        script = 'Z~multi' if c == 'multi' else codepoint_script(c, scripts)
        if script not in script_lst:
            script_lst.append(script)
    return script_lst

def escape_script_list(char_lst, scripts):
    """Return the sorted distinct scripts of `char_lst`, formatted with `str()`."""
    return str(sorted(script_list(char_lst, scripts)))

def debug_emit_mixedscript_confusable_unresolved(f, map, text, scripts):
    """Report code points that no mixed-script rule could classify.

    Does nothing when `map` is empty. Otherwise prints every unresolved
    prototype to standard output (`f` is not written to) and raises
    `Exception`, because new rules have to be added before tables can be
    generated.
    """
    if len(map) == 0:
        return
    print("// " + text + "\n")
    for prototype_text, (prototype, source) in map.items():
        print(prototype_text + " => " + escape_char_list(source) + " // " + escape_script_list(prototype, scripts) + " => " + escape_script_list(source, scripts) + "\n")
    raise Exception("update the python script to add new rules for new data")

def format_table_content(f, content, indent):
    """Write the comma separated `content` to `f`, wrapped below 98 columns.

    Every output line starts with `indent` spaces; items on one line are
    separated by ", " and wrapped lines end with ",". No newline is written
    after the last line.
    """
    line = " "*indent
    first = True
    for chunk in content.split(","):
        if len(line) + len(chunk) < 98:
            line += chunk if first else ", " + chunk
            first = False
        else:
            f.write(line + ",\n")
            line = " "*indent + chunk
    f.write(line)

def escape_char(c):
    """Format the code point `c` as a Rust char literal ('multi' gets a placeholder string)."""
    if c == 'multi':
        return "\"<multiple code points>\""
    return "'\\u{%x}'" % c

def escape_char_list(l):
    """Format the code points in `l` as a bracketed, comma separated list of char literals."""
    return "[" + ", ".join(escape_char(c) for c in l) + "]"

def emit_table(f, name, t_data, t_type = "&'static [(char, char)]", is_pub=True,
        pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1])), is_const=True):
    """Write a Rust slice item called `name` of type `t_type` to `f`.

    `pfun` formats one element of `t_data`. `is_const` selects `const` over
    `let`, and `is_pub` prefixes the declaration with `pub`.
    """
    pub_string = "const" if is_const else "let"
    if is_pub:
        pub_string = "pub " + pub_string
    f.write("    %s %s: %s = &[\n" % (pub_string, name, t_type))
    # Join once instead of growing the string element by element.
    data = ",".join(pfun(dat) for dat in t_data)
    format_table_content(f, data, 8)
    f.write("\n    ];\n\n")

def emit_identifier_module(f):
    """Write the `identifier` Rust module (status and type tables) to `f`."""
    f.write("pub mod identifier {")
    f.write("""

    #[derive(Copy, Clone, Hash, Eq, PartialEq, Ord, PartialOrd, Debug)]
    #[allow(non_camel_case_types)]
    /// https://www.unicode.org/reports/tr39/#Identifier_Status_and_Type
    pub enum IdentifierType {
        // Restricted
        Not_Character,
        Deprecated,
        Default_Ignorable,
        Not_NFKC,
        Not_XID,
        Exclusion,
        Obsolete,
        Technical,
        Uncommon_Use,
        Limited_Use,

        // Allowed
        Inclusion,
        Recommended
    }
    #[inline]
    pub fn identifier_status_allowed(c: char) -> bool {
        // FIXME: do we want to special case ASCII here?
        match c as usize {
            _ => super::util::bsearch_range_table(c, IDENTIFIER_STATUS)
        }
    }

    #[inline]
    pub fn identifier_type(c: char) -> Option<IdentifierType> {
        // FIXME: do we want to special case ASCII here?
        match c as usize {
            _ => super::util::bsearch_range_value_table(c, IDENTIFIER_TYPE)
        }
    }
""")

    f.write("    // Identifier status table:\n")
    identifier_status_table = load_properties("IdentifierStatus.txt")
    emit_table(f, "IDENTIFIER_STATUS", identifier_status_table['Allowed'], "&'static [(char, char)]", is_pub=False,
            pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1])))
    identifier_type = load_properties("IdentifierType.txt")
    type_table = []
    for ty in identifier_type:
        type_table.extend([(x, y, ty) for (x, y) in identifier_type[ty]])

    # The generated lookup is a binary search, so order the ranges by start.
    type_table.sort(key=lambda w: w[0])

    emit_table(f, "IDENTIFIER_TYPE", type_table, "&'static [(char, char, IdentifierType)]", is_pub=False,
            pfun=lambda x: "(%s,%s, IdentifierType::%s)" % (escape_char(x[0]), escape_char(x[1]), x[2]))
    f.write("}\n\n")

def emit_confusable_detection_module(f):
    """Write the `confusable_detection` Rust module (prototype table) to `f`.

    Raises `Exception` when the confusables data maps one code point twice.
    """
    f.write("pub mod confusable_detection {")
    f.write("""

    #[inline]
    pub fn char_confusable_prototype(c: char) -> Option<&'static [char]> {
        // FIXME: do we want to special case ASCII here?
        match c as usize {
            _ => super::util::bsearch_value_table(c, CONFUSABLES)
        }
    }

""")

    f.write("    // Confusable table:\n")
    confusable_table = load_confusables("confusables.txt")
    confusable_table.sort(key=lambda w: w[0])

    # After sorting, duplicate keys are adjacent.
    last_key = None
    for (k, _) in confusable_table:
        if k == last_key:
            raise Exception("duplicate keys in confusables table: %s" % k)
        last_key = k

    emit_table(f, "CONFUSABLES", confusable_table, "&'static [(char, &'static [char])]", is_pub=False,
            pfun=lambda x: "(%s, &%s)" % (escape_char(x[0]), escape_char_list(x[1])))
    f.write("}\n\n")

def escape_script_constant(name, longforms):
    """Return the `Script::<LongName>` constant path for the short script `name`."""
    return "Script::" + longforms[name].strip()

def emit_potiential_mixed_script_confusable(f):
    """Write the `potential_mixed_script_confusable` Rust module to `f`.

    Raises `Exception` (via the unresolved check) when the data contains
    prototype cases that no rule classifies.
    """
    f.write("pub mod potential_mixed_script_confusable {")
    f.write("""
    #[inline]
    pub fn potential_mixed_script_confusable(c: char) -> bool {
        match c as usize {
            _ => super::util::bsearch_table(c, CONFUSABLES)
        }
    }
""")
    identifier_status_table = load_properties("IdentifierStatus.txt")
    _, scripts = load_scripts("Scripts.txt")
    identifier_allowed = identifier_status_table['Allowed']
    (mixedscript_confusable, mixedscript_confusable_unresolved) = load_potential_mixedscript_confusables("confusables.txt", identifier_allowed, scripts)
    debug = False
    if debug:
        debug_emit_mixedscript_confusable(f, mixedscript_confusable, "mixedscript_confusable", scripts)
    # Always run the unresolved check: it returns immediately for an empty
    # table and otherwise aborts, so unclassified data is never dropped silently.
    debug_emit_mixedscript_confusable_unresolved(f, mixedscript_confusable_unresolved, "mixedscript_confusable_unresolved", scripts)
    confusable_table = [(pair[0], script)
                        for script, lst in mixedscript_confusable.items()
                        for pair in lst.values()]
    confusable_table.sort(key=lambda w: w[0])
    emit_table(f, "CONFUSABLES", confusable_table, "&'static [char]", is_pub=False,
            pfun=lambda x: "%s" % escape_char(x[0]))
    f.write("}\n\n")


def emit_util_mod(f):
    """Write the `util` Rust module with the binary search helpers to `f`."""
    f.write("""
pub mod util {
    use core::result::Result::{Ok, Err};

    #[inline]
    pub fn bsearch_table(c: char, r: &'static [char]) -> bool {
        r.binary_search(&c).is_ok()
    }

    #[inline]
    pub fn bsearch_value_table<T: Copy>(c: char, r: &'static [(char, T)]) -> Option<T> {
        match r.binary_search_by_key(&c, |&(k, _)| k) {
            Ok(idx) => {
                let (_, v) = r[idx];
                Some(v)
            }
            Err(_) => None
        }
    }

    #[inline]
    pub fn bsearch_range_table(c: char, r: &'static [(char,char)]) -> bool {
        use core::cmp::Ordering::{Equal, Less, Greater};
        r.binary_search_by(|&(lo,hi)| {
            if lo <= c && c <= hi { Equal }
            else if hi < c { Less }
            else { Greater }
        }).is_ok()
    }

    pub fn bsearch_range_value_table<T: Copy>(c: char, r: &'static [(char, char, T)]) -> Option<T> {
        use core::cmp::Ordering::{Equal, Less, Greater};
        match r.binary_search_by(|&(lo, hi, _)| {
            if lo <= c && c <= hi { Equal }
            else if hi < c { Less }
            else { Greater }
        }) {
            Ok(idx) => {
                let (_, _, cat) = r[idx];
                Some(cat)
            }
            Err(_) => None
        }
    }

}

""")

if __name__ == "__main__":
    r = "tables.rs"
    if os.path.exists(r):
        os.remove(r)
    with open(r, "w") as rf:
        # write the file's preamble
        rf.write(preamble)

        rf.write("""
/// The version of [Unicode](http://www.unicode.org/)
/// that this version of unicode-security is based on.
pub const UNICODE_VERSION: (u64, u64, u64) = (%s, %s, %s);

""" % UNICODE_VERSION)

        emit_util_mod(rf)
        ### identifier module
        emit_identifier_module(rf)
        ### confusable_detection module
        emit_confusable_detection_module(rf)
        ### mixed_script_confusable_detection module
        emit_potiential_mixed_script_confusable(rf)