#!/usr/bin/env python
# Copyright 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE.chromium file.

"""
A Deterministic acyclic finite state automaton (DAFSA) is a compact
representation of an unordered word list (dictionary).

https://en.wikipedia.org/wiki/Deterministic_acyclic_finite_state_automaton

This python program converts a list of strings to a byte array in C++.
This python program fetches strings and return values from a gperf file
and generates a C++ file with a byte array representing graph that can be
used as a memory efficient replacement for the perfect hash table.

The input strings must consist of printable 7-bit ASCII characters or UTF-8
multibyte sequences. Control characters in the range [0x00-0x1F] are not
allowed. The return values must be one digit integers.

In this program a DAFSA is a diamond shaped graph starting at a common
source node and ending at a common sink node. All internal nodes contain
a label and each word is represented by the labels in one path from
the source node to the sink node.

The following python representation is used for nodes:

  Source node: [ children ]
  Internal node: (label, [ children ])
  Sink node: None

The graph is first compressed by prefixes like a trie. In the next step
suffixes are compressed so that the graph gets diamond shaped. Finally
one to one linked nodes are replaced by nodes with the labels joined.

The order of the operations is crucial since lookups will be performed
starting from the source with no backtracking. Thus a node must have at
most one child with a label starting by the same character. The output
is also arranged so that all jumps are to increasing addresses, thus forward
in memory.

The generated output has suffix free decoding so that the sign of leading
bits in a link (a reference to a child node) indicate if it has a size of one,
two or three bytes and if it is the last outgoing link from the actual node.
A node label is terminated by a byte with the leading bit set.

The generated byte array can be described by the following BNF:

<byte> ::= < 8-bit value in range [0x00-0xFF] >

<char> ::= < byte in range [0x1F-0x7F] >
<end_char> ::= < char + 0x80, byte in range [0x9F-0xFF] >
<return_value> ::= < value + 0x80, byte in range [0x80-0x8F] >

<offset1> ::= < byte in range [0x00-0x3F] >
<offset2> ::= < byte in range [0x40-0x5F] >
<offset3> ::= < byte in range [0x60-0x7F] >

<end_offset1> ::= < byte in range [0x80-0xBF] >
<end_offset2> ::= < byte in range [0xC0-0xDF] >
<end_offset3> ::= < byte in range [0xE0-0xFF] >

<prefix> ::= <char>

<label> ::= <end_char>
          | <char> <label>

<end_label> ::= <return_value>
          | <char> <end_label>

<offset> ::= <offset1>
          | <offset2> <byte>
          | <offset3> <byte> <byte>

<end_offset> ::= <end_offset1>
          | <end_offset2> <byte>
          | <end_offset3> <byte> <byte>

<offsets> ::= <end_offset>
          | <offset> <offsets>

<source> ::= <offsets>

<node> ::= <label> <offsets>
          | <prefix> <node>
          | <end_label>

<graph> ::= <node>
          | <graph> <node>

<version> ::= <empty>  # The DAFSA was generated in ASCII mode.
          | < byte value 0x01 >  # The DAFSA was generated in UTF-8 mode.

<dafsa> ::= <graph> <version>

Decoding:

<char> -> character
<end_char> & 0x7F -> character
<return_value> & 0x0F -> integer
<offset1 & 0x3F> -> integer
((<offset2> & 0x1F>) << 8) + <byte> -> integer
((<offset3> & 0x1F>) << 16) + (<byte> << 8) + <byte> -> integer

end_offset1, end_offset2 and end_offset3 are decoded same as offset1,
offset2 and offset3 respectively.

The first offset in a list of offsets is the distance in bytes between the
offset itself and the first child node. Subsequent offsets are the distance
between previous child node and next child node. Thus each offset links a node
to a child node. The distance is always counted between start addresses, i.e.
first byte in decoded offset or first byte in child node.

Transcoding of UTF-8 multibyte sequences:

The original DAFSA format was limited to 7-bit printable ASCII characters in
range [0x20-0x7F], but has been extended to allow UTF-8 multibyte sequences.
By transcoding of such characters the new format preserves compatibility with
old parsers, so that a DAFSA in the extended format can be used by an old
parser without false positives, although strings containing transcoded
characters will never match. Since the format is extended rather than being
changed, a parser supporting the new format will automatically support data
generated in the old format.

Transcoding is performed by insertion of a start byte with the special value
0x1F, followed by 2-4 bytes shifted into the range [0x40-0x7F], thus inside
the range of printable ASCII.

2-byte: 110nnnnn, 10nnnnnn -> 00011111, 010nnnnn, 01nnnnnn

3-byte: 1110nnnn, 10nnnnnn, 10nnnnnn -> 00011111, 0110nnnn, 01nnnnnn, 01nnnnnn

4-byte: 11110nnn, 10nnnnnn, 10nnnnnn, 10nnnnnn ->
        00011111, 01110nnn, 01nnnnnn, 01nnnnnn, 01nnnnnn

Example 1:

%%
aa, 1
a, 2
%%

The input is first parsed to a list of words:
["aa1", "a2"]

A fully expanded graph is created from the words:
source = [node1, node4]
node1 = ("a", [node2])
node2 = ("a", [node3])
node3 = ("\x01", [sink])
node4 = ("a", [node5])
node5 = ("\x02", [sink])
sink = None

Compression results in the following graph:
source = [node1]
node1 = ("a", [node2, node3])
node2 = ("\x02", [sink])
node3 = ("a\x01", [sink])
sink = None

A C++ representation of the compressed graph is generated:

const unsigned char dafsa[7] = {
  0x81, 0xE1, 0x02, 0x81, 0x82, 0x61, 0x81,
};

The bytes in the generated array have the following meaning:

 0: 0x81 <end_offset1> child at position 0 + (0x81 & 0x3F) -> jump to 1

 1: 0xE1 <end_char> label character (0xE1 & 0x7F) -> match "a"
 2: 0x02 <offset1> child at position 2 + (0x02 & 0x3F) -> jump to 4

 3: 0x81 <end_offset1> child at position 4 + (0x81 & 0x3F) -> jump to 5
 4: 0x82 <return_value> 0x82 & 0x0F -> return 2

 5: 0x61 <char> label character 0x61 -> match "a"
 6: 0x81 <return_value> 0x81 & 0x0F -> return 1

Example 2:

%%
aa, 1
bbb, 2
baa, 1
%%

The input is first parsed to a list of words:
["aa1", "bbb2", "baa1"]

Compression results in the following graph:
source = [node1, node2]
node1 = ("b", [node2, node3])
node2 = ("aa\x01", [sink])
node3 = ("bb\x02", [sink])
sink = None

A C++ representation of the compressed graph is generated:

const unsigned char dafsa[11] = {
  0x02, 0x83, 0xE2, 0x02, 0x83, 0x61, 0x61, 0x81, 0x62, 0x62, 0x82,
};

The bytes in the generated array have the following meaning:

 0: 0x02 <offset1> child at position 0 + (0x02 & 0x3F) -> jump to 2
 1: 0x83 <end_offset1> child at position 2 + (0x83 & 0x3F) -> jump to 5

 2: 0xE2 <end_char> label character (0xE2 & 0x7F) -> match "b"
 3: 0x02 <offset1> child at position 3 + (0x02 & 0x3F) -> jump to 5
 4: 0x83 <end_offset1> child at position 5 + (0x83 & 0x3F) -> jump to 8

 5: 0x61 <char> label character 0x61 -> match "a"
 6: 0x61 <char> label character 0x61 -> match "a"
 7: 0x81 <return_value> 0x81 & 0x0F -> return 1

 8: 0x62 <char> label character 0x62 -> match "b"
 9: 0x62 <char> label character 0x62 -> match "b"
10: 0x82 <return_value> 0x82 & 0x0F -> return 2
"""

import sys
import os.path
import time
import hashlib
import json


class InputError(Exception):
  """Exception raised for errors in the input file."""


# Length in bytes of the character starting at a given lead byte.
# 0 marks bytes that cannot start a character (control chars, UTF-8
# continuation bytes and invalid lead bytes 0xF8-0xFF).
char_length_table = ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  # 0x00-0x0F
                      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  # 0x10-0x1F
                      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  # 0x20-0x2F
                      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  # 0x30-0x3F
                      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  # 0x40-0x4F
                      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  # 0x50-0x5F
                      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  # 0x60-0x6F
                      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  # 0x70-0x7F
                      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  # 0x80-0x8F
                      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  # 0x90-0x9F
                      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  # 0xA0-0xAF
                      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  # 0xB0-0xBF
                      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  # 0xC0-0xCF
                      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  # 0xD0-0xDF
                      3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  # 0xE0-0xEF
                      4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0 ) # 0xF0-0xFF


def to_bytes(n):
  """Converts an integer value to a one-byte bytes object."""
  return bytes(bytearray((n,)))


def to_dafsa(words, utf_mode):
  """Generates a DAFSA from a word list and returns the source node.

  Each word is split into characters so that each character is represented by
  a unique node. The last character of each word must be a hex digit encoding
  the return value. It is assumed the word list is not empty.

  Raises:
    InputError: if the list is empty or a word contains a byte that is not
      printable ASCII or (in UTF-8 mode) a valid UTF-8 sequence.
  """
  if not words:
    raise InputError('The domain list must not be empty')

  def to_nodes(word, multibyte_length):
    """Split a word (bytes) into a linked chain of single-character nodes."""
    byte = ord(word[:1])
    if multibyte_length:
      # Consume next byte in multibyte sequence; continuation bytes are
      # transcoded by flipping 10nnnnnn into 01nnnnnn (see module docstring).
      if byte & 0xC0 != 0x80:
        raise InputError('Invalid UTF-8 multibyte sequence')
      return to_bytes(byte ^ 0xC0), [to_nodes(word[1:], multibyte_length - 1)]
    char_length = char_length_table[byte]
    if char_length == 1:
      # 7-bit printable ASCII.
      if len(word) == 1:
        # Last byte is the return value (one hex digit).
        return to_bytes(int(word[:1], 16) & 0x0F), [None]
      return word[:1], [to_nodes(word[1:], 0)]
    elif char_length > 1:
      # Leading byte in a multibyte sequence: emit the 0x1F start marker and
      # shift the lead byte into printable range (see module docstring).
      if not utf_mode:
        raise InputError('UTF-8 encoded characters are not allowed in ASCII mode')
      if len(word) <= char_length:
        raise InputError('Unterminated UTF-8 multibyte sequence')
      return to_bytes(0x1F), [(to_bytes(byte ^ 0x80), [to_nodes(word[1:], char_length - 1)])]
    # Unexpected character.
    raise InputError('Domain names must be printable ASCII or UTF-8')

  return [to_nodes(word, 0) for word in words]


def to_words(node):
  """Generates a word list from all paths starting from an internal node."""
  if not node:
    return [b'']
  return [(node[0] + word) for child in node[1] for word in to_words(child)]


def reverse(dafsa):
  """Generates a new DAFSA that is reversed, so that the old sink node becomes
  the new source node.
  """
  sink = []
  nodemap = {}

  def dfs(node, parent):
    """Creates reverse nodes.

    A new reverse node will be created for each old node. The new node will
    get a reversed label and the parents of the old node as children.
    """
    if not node:
      sink.append(parent)
    elif id(node) not in nodemap:
      nodemap[id(node)] = (node[0][::-1], [parent])
      for child in node[1]:
        dfs(child, nodemap[id(node)])
    else:
      nodemap[id(node)][1].append(parent)

  for node in dafsa:
    dfs(node, None)
  return sink


def join_labels(dafsa):
  """Generates a new DAFSA where internal nodes are merged if there is a one to
  one connection.
  """
  parentcount = {id(None): 2}  # Pretend the sink has >1 parent so it never merges.
  nodemap = {id(None): None}

  def count_parents(node):
    """Count incoming references"""
    if id(node) in parentcount:
      parentcount[id(node)] += 1
    else:
      parentcount[id(node)] = 1
      for child in node[1]:
        count_parents(child)

  def join(node):
    """Create new nodes"""
    if id(node) not in nodemap:
      children = [join(child) for child in node[1]]
      if len(children) == 1 and parentcount[id(node[1][0])] == 1:
        # Single child with a single parent: concatenate the labels.
        child = children[0]
        nodemap[id(node)] = (node[0] + child[0], child[1])
      else:
        nodemap[id(node)] = (node[0], children)
    return nodemap[id(node)]

  for node in dafsa:
    count_parents(node)
  return [join(node) for node in dafsa]


def join_suffixes(dafsa):
  """Generates a new DAFSA where nodes that represent the same word lists
  towards the sink are merged.
  """
  nodemap = {frozenset((b'',)): None}

  def join(node):
    """Returns a matching node. A new node is created if no matching node
    exists. The graph is accessed in dfs order.
    """
    suffixes = frozenset(to_words(node))
    if suffixes not in nodemap:
      nodemap[suffixes] = (node[0], [join(child) for child in node[1]])
    return nodemap[suffixes]

  return [join(node) for node in dafsa]


def top_sort(dafsa):
  """Generates list of nodes in topological sort order."""
  incoming = {}

  def count_incoming(node):
    """Counts incoming references."""
    if node:
      if id(node) not in incoming:
        incoming[id(node)] = 1
        for child in node[1]:
          count_incoming(child)
      else:
        incoming[id(node)] += 1

  for node in dafsa:
    count_incoming(node)

  # Source nodes have no incoming edges; remove the artificial count above.
  for node in dafsa:
    incoming[id(node)] -= 1

  waiting = [node for node in dafsa if incoming[id(node)] == 0]
  nodes = []

  while waiting:
    node = waiting.pop()
    assert incoming[id(node)] == 0
    nodes.append(node)
    for child in node[1]:
      if child:
        incoming[id(child)] -= 1
        if incoming[id(child)] == 0:
          waiting.append(child)
  return nodes


def encode_links(children, offsets, current):
  """Encodes a list of children as one, two or three byte offsets.

  `offsets` maps id(node) -> position (counted from the end of the output so
  far) and `current` is the present output length. The encoding is iterated
  because the size of the links themselves affects the distances.
  """
  if not children[0]:
    # This is an <end_label> node and no links follow such nodes.
    assert len(children) == 1
    return []
  guess = 3 * len(children)
  assert children
  children = sorted(children, key=lambda x: -offsets[id(x)])
  while True:
    offset = current + guess
    buf = []
    for child in children:
      last = len(buf)
      distance = offset - offsets[id(child)]
      assert distance > 0 and distance < (1 << 21)

      if distance < (1 << 6):
        # A 6-bit offset: "s0xxxxxx"
        buf.append(distance)
      elif distance < (1 << 13):
        # A 13-bit offset: "s10xxxxxxxxxxxxx"
        buf.append(0x40 | (distance >> 8))
        buf.append(distance & 0xFF)
      else:
        # A 21-bit offset: "s11xxxxxxxxxxxxxxxxxxxxx"
        buf.append(0x60 | (distance >> 16))
        buf.append((distance >> 8) & 0xFF)
        buf.append(distance & 0xFF)
      # Distance in first link is relative to following record.
      # Distance in other links are relative to previous link.
      offset -= distance
    if len(buf) == guess:
      break
    guess = len(buf)
  # Set most significant bit to mark end of links in this node.
  buf[last] |= (1 << 7)
  buf.reverse()
  return buf


def encode_prefix(label):
  """Encodes a node label as a list of bytes without a trailing high byte.

  This method encodes a node if there is exactly one child and the
  child follows immediately after so that no jump is needed. This label
  will then be a prefix to the label in the child node.
  """
  assert label
  return [c for c in bytearray(reversed(label))]


def encode_label(label):
  """Encodes a node label as a list of bytes with a trailing high byte >0x80.
  """
  buf = encode_prefix(label)
  # Set most significant bit to mark end of label in this node.
  buf[0] |= (1 << 7)
  return buf


def encode(dafsa, utf_mode):
  """Encodes a DAFSA to a list of bytes.

  Nodes are emitted in reverse topological order so that all links point
  forward in memory; the whole buffer is reversed at the end. In UTF-8 mode a
  trailing version byte 0x01 is appended (see <version> in the BNF).
  """
  output = []
  offsets = {}

  for node in reversed(top_sort(dafsa)):
    if (len(node[1]) == 1 and node[1][0] and
        (offsets[id(node[1][0])] == len(output))):
      # The only child is directly adjacent: emit as <prefix>, no links.
      output.extend(encode_prefix(node[0]))
    else:
      output.extend(encode_links(node[1], offsets, len(output)))
      output.extend(encode_label(node[0]))
    offsets[id(node)] = len(output)

  output.extend(encode_links(dafsa, offsets, len(output)))
  output.reverse()
  if utf_mode:
    output.append(0x01)
  return output


def to_cxx(data, codecs):
  """Generates C++ code from a list of encoded bytes."""
  text = b'/* This file has been generated by hsts-make-dafsa. DO NOT EDIT!\n\n'
  text += b'The byte array encodes a list of HSTS domains. See hsts-make-dafsa source for'
  text += b' documentation.'
  text += b'*/\n\n'
  text += b'static const unsigned char kDafsa['
  text += bytes(str(len(data)), **codecs)
  text += b'] = {\n'
  for i in range(0, len(data), 12):
    text += b'  '
    text += bytes(', '.join('0x%02x' % byte for byte in data[i:i + 12]), **codecs)
    text += b',\n'
  text += b'};\n'
  return text


def sha1_file(name):
  """Returns the hex SHA-1 digest of the file `name`, read in chunks."""
  sha1 = hashlib.sha1()
  with open(name, 'rb') as f:
    while True:
      data = f.read(65536)
      if not data:
        break
      sha1.update(data)
  return sha1.hexdigest()


def to_cxx_plus(data, codecs):
  """Generates C++ code from a word list plus some variable assignments as
  needed by libhsts.

  Relies on the module-level variables hsts_input_file (set by main()) and
  hsts_ndomains (set by parse_hsts()).
  """
  text = to_cxx(data, codecs)
  text += b'static time_t _hsts_file_time = %d;\n' % os.stat(hsts_input_file).st_mtime
  text += b'static int _hsts_ndomains = %d;\n' % hsts_ndomains
  text += b'static const char _hsts_sha1_checksum[] = "%s";\n' % bytes(sha1_file(hsts_input_file), **codecs)
  text += b'static const char _hsts_filename[] = "%s";\n' % bytes(hsts_input_file, **codecs)
  return text


def words_to_whatever(words, converter, utf_mode, codecs):
  """Builds and compresses the DAFSA, then runs `converter` on the encoding."""
  dafsa = to_dafsa(words, utf_mode)
  for fun in (reverse, join_suffixes, reverse, join_suffixes, join_labels):
    dafsa = fun(dafsa)
  return converter(encode(dafsa, utf_mode), codecs)


def words_to_cxx(words, utf_mode, codecs):
  """Generates C++ code from a word list."""
  return words_to_whatever(words, to_cxx, utf_mode, codecs)


def words_to_cxx_plus(words, utf_mode, codecs):
  """Generates C++ code from a word list plus some variable assignments as
  needed by libhsts."""
  return words_to_whatever(words, to_cxx_plus, utf_mode, codecs)


def words_to_binary(words, utf_mode, codecs):
  """Generates binary DAFSA data (with file magic) from a word list."""
  return b'.DAFSA@HSTS_0 \n' + words_to_whatever(words, lambda x, _: bytearray(x), utf_mode, codecs)


def parse_hsts(infile, utf_mode, codecs):
  """Parses a HSTS preload JSON file and extracts words.

  Each returned word is the domain name (as bytes) followed by one hex digit
  encoding the entry flags. As a side effect sets the module-level variable
  hsts_ndomains to the number of parsed entries (used by to_cxx_plus()).
  """
  HSTS_FLAG_INCLUDE_SUBDIRS = (1 << 0)

  global hsts_ndomains

  data = json.load(infile)["entries"]

  hsts = {}
  nentries = 0

  for entry in data:
    flags = 0
    # The preload list uses a JSON boolean here; anything else means unset.
    if entry.get('include_subdomains') == True:
      flags = HSTS_FLAG_INCLUDE_SUBDIRS

    domain = bytes(entry['name'].strip(), **codecs)
    # NOTE(review): a previous revision transcoded domains via IDNA here;
    # that code was disabled and has been removed.
    hsts[domain] = flags
    nentries += 1

  # Fix: to_cxx_plus() reads hsts_ndomains, which was never assigned before
  # (only an unused hsts_nsuffixes existed), so --output-format=cxx+ crashed
  # with a NameError.
  hsts_ndomains = nentries

  return [domain + bytes('%X' % (flags & 0x0F), **codecs)
          for (domain, flags) in sorted(hsts.items())]


def usage():
  """Prints the usage and exits with status 1."""
  print('usage: %s [options] infile outfile' % sys.argv[0])
  print('  --output-format=cxx     Write DAFSA as C/C++ code (default)')
  print('  --output-format=cxx+    Write DAFSA as C/C++ code plus statistical assignments')
  print('  --output-format=binary  Write DAFSA binary data')
  print('  --encoding=ascii        7-bit ASCII mode')
  print('  --encoding=utf-8        UTF-8 mode (default)')
  sys.exit(1)


def main():
  """Convert HSTS file into C or binary DAFSA file"""
  if len(sys.argv) < 3:
    usage()

  converter = words_to_cxx
  parser = parse_hsts
  utf_mode = True

  # open()/bytes() need an explicit encoding on Python 3 only.
  codecs = dict()
  if sys.version_info.major > 2:
    codecs['encoding'] = 'utf-8'

  for arg in sys.argv[1:-2]:
    if arg.startswith('--output-format='):
      value = arg[16:].lower()
      if value == 'binary':
        converter = words_to_binary
      elif value == 'cxx':
        converter = words_to_cxx
      elif value == 'cxx+':
        converter = words_to_cxx_plus
      else:
        print("Unknown output format '%s'" % value)
        return 1
    elif arg.startswith('--encoding='):
      value = arg[11:].lower()
      if value == 'ascii':
        utf_mode = False
      elif value == 'utf-8':
        utf_mode = True
      else:
        print("Unknown encoding '%s'" % value)
        return 1
    else:
      usage()

  if sys.argv[-2] == '-':
    # Reading from stdin: no file to stat/checksum, so --output-format=cxx+
    # is not supported on this path.
    with open(sys.argv[-1], 'wb') as outfile:
      outfile.write(converter(parser(sys.stdin, utf_mode, codecs), utf_mode, codecs))
  else:
    # Statistical data for --output-format=cxx+ (see to_cxx_plus()).
    global hsts_input_file, hsts_ndomains

    hsts_input_file = sys.argv[-2]
    hsts_ndomains = 0

    with open(sys.argv[-2], 'r', **codecs) as infile, open(sys.argv[-1], 'wb') as outfile:
      outfile.write(converter(parser(infile, utf_mode, codecs), utf_mode, codecs))

  return 0


if __name__ == '__main__':
  sys.exit(main())