# numdb.py - module for handling hierarchically organised numbers
#
# Copyright (C) 2010-2019 Arthur de Jong
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301 USA

"""Query structured number format files with number properties.

This module contains functions for reading and querying a database that
stores numbers that use a hierarchical format (e.g. ISBN, IBAN, phone
numbers, etc).

To read a database from a file:

>>> with open('tests/numdb-test.dat', 'r') as f:
...     dbfile = read(f)

To split a number:

>>> dbfile.split('01006')
['0', '100', '6']
>>> dbfile.split('902006')
['90', '20', '06']
>>> dbfile.split('909856')
['90', '985', '6']

To split the number and get properties for each part:

>>> dbfile.info('01006') == [
...     ('0', {'prop1': 'foo'}),
...     ('100', {'prop2': 'bar'}),
...     ('6', {}),
... ]
True
>>> dbfile.info('02006') == [
...     ('0', {'prop1': 'foo'}),
...     ('200', {'prop2': 'bar', 'prop3': 'baz'}),
...     ('6', {}),
... ]
True
>>> dbfile.info('03456') == [
...     ('0', {'prop1': 'foo'}),
...     ('345', {'prop2': 'bar', 'prop3': 'baz'}),
...     ('6', {}),
... ]
True
>>> dbfile.info('902006') == [
...     ('90', {'prop1': 'booz'}),
...     ('20', {'prop2': 'foo'}),
...     ('06', {}),
... ]
True
>>> dbfile.info('909856') == [
...     ('90', {'prop1': 'booz'}),
...     ('985', {'prop2': 'fooz'}),
...     ('6', {}),
... ]
True
>>> dbfile.info('9889') == [
...     ('98', {'prop1': 'booz'}),
...     ('89', {'prop2': 'foo'}),
... ]
True
>>> dbfile.info('633322') == [
...     ('6', {'prop1': 'boo'}),
...     ('333', {'prop2': 'bar', 'prop3': 'baz', 'prop4': 'bla'}),
...     ('22', {}),
... ]
True

"""

import re

try:
    from pkg_resources import resource_stream
except ImportError:  # pragma: no cover (setuptools is not installed)
    # pkg_resources is only needed by get(); reading and querying a database
    # from an arbitrary file object must keep working without setuptools
    resource_stream = None


# a data line: optional indentation (spaces only), a comma-separated list of
# values or low-high ranges and finally any number of name="value" properties
_line_re = re.compile(
    r'^(?P<indent> *)'
    r'(?P<ranges>([^-,\s]+(-[^-,\s]+)?)(,[^-,\s]+(-[^-,\s]+)?)*)\s*'
    r'(?P<props>.*)$')
_prop_re = re.compile(
    r'(?P<prop>[0-9a-zA-Z-_]+)="(?P<value>[^"]*)"')

# this is a cache of open databases
_open_databases = {}

# the prefixes attribute of NumDB is structured as follows:
# prefixes = [
#   [ length, low, high, props, children ]
#   ...
# ]
# where children is a prefixes structure in its own right
# (there is no expected ordering within the list)


class NumDB():
    """Number database."""

    def __init__(self):
        """Construct an empty database."""
        self.prefixes = []

    @staticmethod
    def _merge(results):
        """Merge the provided list of possible results into a single result
        list (this is a generator).

        Each element of results is a list of (part, props) tuples. The
        results are combined position by position: the shortest part found
        at a position is used and the properties of all candidates at that
        position are merged into one dict (later candidates overwrite
        earlier ones). The results list must not be empty."""
        # expand the results to all have the same length
        ml = max(len(x) for x in results)
        results = [x + (ml - len(x)) * [None]
                   for x in results]
        # go over each part
        for parts in zip(*results):
            # regroup parts into parts list and properties list
            # (the None padding added above is filtered out here)
            partlist, proplist = list(zip(*(x for x in parts if x)))
            part = min(partlist, key=len)
            props = {}
            for p in proplist:
                props.update(p)
            yield part, props

    @staticmethod
    def _find(number, prefixes):
        """Lookup the specified number in the list of prefixes, this will
        return basically what info() should return but works recursively."""
        if not number:
            return []
        results = []
        # children of a leaf entry are stored as None
        if prefixes:
            for length, low, high, props, children in prefixes:
                # strings are compared lexicographically; the length check
                # ensures the slice is not shorter than the range bounds
                if low <= number[:length] <= high and len(number) >= length:
                    results.append([(number[:length], props)] +
                                   NumDB._find(number[length:], children))
        # not-found fallback
        if not results:
            return [(number, {})]
        # merge the results into a single result
        return list(NumDB._merge(results))

    def info(self, number):
        """Split the provided number in components and associate properties
        with each component. This returns a list of tuples. Each tuple
        consists of a string (a part of the number) and a dict of properties.
        """
        return NumDB._find(number, self.prefixes)

    def split(self, number):
        """Split the provided number in components. This returns a list with
        the parts of the number that were identified."""
        return [part for part, props in self.info(number)]


def _parse(fp):
    """Read lines of text from the file pointer and generate indent, length,
    low, high, properties tuples.

    Lines that start with a # and blank lines are skipped. One tuple is
    generated for every value or range on a line (all tuples from one line
    share the same properties dict). A ValueError is raised for a line that
    is not in the expected format."""
    for line in fp:
        # ignore comments
        if line[0] == '#' or line.strip() == '':
            continue  # pragma: no cover (optimisation takes it out)
        # any other line should parse
        match = _line_re.search(line)
        if match is None:
            # e.g. a line indented with tabs or starting with a - or ,
            raise ValueError('invalid line in number database: %r' % (line,))
        indent = len(match.group('indent'))
        ranges = match.group('ranges')
        props = dict(_prop_re.findall(match.group('props')))
        for rnge in ranges.split(','):
            if '-' in rnge:
                low, high = rnge.split('-')
            else:
                low, high = rnge, rnge
            yield indent, len(low), low, high, props


def read(fp):
    """Return a new database with the data read from the specified file.

    The file pointer should provide lines of text. The indentation of each
    line determines the nesting: a line that is indented deeper than the
    previous line starts the list of children of the previous entry."""
    last_indent = 0
    db = NumDB()
    # maps an indentation level to the prefixes list currently being filled
    stack = {0: db.prefixes}
    for indent, length, low, high, props in _parse(fp):
        if indent > last_indent:
            # populate the children field of the last indent
            stack[last_indent][-1][4] = []
            stack[indent] = stack[last_indent][-1][4]
        stack[indent].append([length, low, high, props, None])
        last_indent = indent
    return db


def _open_resource(name):
    """Return a binary stream for the named data file that is shipped
    alongside this module."""
    if resource_stream is not None:
        return resource_stream(__name__, name)
    # setuptools is not available: use the standard library instead
    import importlib.resources
    return importlib.resources.files(__package__).joinpath(name).open('rb')


def get(name):
    """Open a database with the specified name to perform queries on.

    The database is read from the name + '.dat' resource and is cached so
    subsequent calls with the same name return the same NumDB instance."""
    if name not in _open_databases:
        import codecs
        reader = codecs.getreader('utf-8')
        with reader(_open_resource(name + '.dat')) as fp:
            _open_databases[name] = read(fp)
    return _open_databases[name]