1# numdb.py - module for handling hierarchically organised numbers
2#
3# Copyright (C) 2010-2019 Arthur de Jong
4#
5# This library is free software; you can redistribute it and/or
6# modify it under the terms of the GNU Lesser General Public
7# License as published by the Free Software Foundation; either
8# version 2.1 of the License, or (at your option) any later version.
9#
10# This library is distributed in the hope that it will be useful,
11# but WITHOUT ANY WARRANTY; without even the implied warranty of
12# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13# Lesser General Public License for more details.
14#
15# You should have received a copy of the GNU Lesser General Public
16# License along with this library; if not, write to the Free Software
17# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
18# 02110-1301 USA
19
20"""Query structured number format files with number properties.
21
22This module contains functions for reading and querying a database that
23stores numbers that use a hierarchical format (e.g. ISBN, IBAN, phone
24numbers, etc).
25
26To read a database from a file:
27
28>>> with open('tests/numdb-test.dat', 'r') as f:
29...     dbfile = read(f)
30
31To split a number:
32
33>>> dbfile.split('01006')
34['0', '100', '6']
35>>> dbfile.split('902006')
36['90', '20', '06']
37>>> dbfile.split('909856')
38['90', '985', '6']
39
40To split the number and get properties for each part:
41
42>>> dbfile.info('01006') == [
43...     ('0',   {'prop1': 'foo'}),
44...     ('100', {'prop2': 'bar'}),
45...     ('6',   {}),
46... ]
47True
48>>> dbfile.info('02006') == [
49...     ('0',   {'prop1': 'foo'}),
50...     ('200', {'prop2': 'bar', 'prop3': 'baz'}),
51...     ('6',   {}),
52... ]
53True
54>>> dbfile.info('03456') == [
55...     ('0', {'prop1': 'foo'}),
56...     ('345', {'prop2': 'bar', 'prop3': 'baz'}),
57...     ('6', {}),
58... ]
59True
60>>> dbfile.info('902006') == [
61...     ('90', {'prop1': 'booz'}),
62...     ('20', {'prop2': 'foo'}),
63...     ('06', {}),
64... ]
65True
66>>> dbfile.info('909856') == [
67...     ('90', {'prop1': 'booz'}),
68...     ('985', {'prop2': 'fooz'}),
69...     ('6', {}),
70... ]
71True
72>>> dbfile.info('9889') == [
73...     ('98', {'prop1': 'booz'}),
74...     ('89', {'prop2': 'foo'}),
75... ]
76True
77>>> dbfile.info('633322') == [
78...     ('6', {'prop1': 'boo'}),
79...     ('333', {'prop2': 'bar', 'prop3': 'baz', 'prop4': 'bla'}),
80...     ('22', {}),
81... ]
82True
83
84"""
85
86import re
87
88from pkg_resources import resource_stream
89
90
# matches one data line of a database file: the leading spaces (indent),
# a comma-separated list of single values or low-high ranges (ranges) and
# whatever text follows on the line (props)
_line_re = re.compile(
    r'^(?P<indent> *)'
    r'(?P<ranges>([^-,\s]+(-[^-,\s]+)?)(,[^-,\s]+(-[^-,\s]+)?)*)\s*'
    r'(?P<props>.*)$')
# matches a single name="value" property; the value cannot contain a
# double quote character
_prop_re = re.compile(
    r'(?P<prop>[0-9a-zA-Z-_]+)="(?P<value>[^"]*)"')

# this is a cache of open databases, keyed by the name passed to get()
_open_databases = {}

# the prefixes attribute of NumDB is structured as follows:
# prefixes = [
#   [ length, low, high, props, children ]
#   ...
# ]
# where children is a prefixes structure in its own right, or None for an
# entry that has no children (read() only creates the list when needed)
# (there is no expected ordering within the list)
108
109
class NumDB():
    """Number database.

    The data is kept in the prefixes attribute as a tree of entries. Every
    entry is a [length, low, high, props, children] list in which children
    is again a list of such entries (or a false value when there are none).
    """

    def __init__(self):
        """Set up a database that does not contain any prefixes yet."""
        self.prefixes = []

    @staticmethod
    def _merge(results):
        """Combine alternative lookup results into one (this is a generator).

        Every item in results is a list of (part, props) tuples. The lists
        are combined position by position: for each position the shortest
        part is yielded together with the properties of all results that
        have an entry at that position (later results override earlier
        ones on conflicting property names).
        """
        # pad the shorter results with None so zip() covers every position
        longest = max(len(result) for result in results)
        padded = [result + [None] * (longest - len(result))
                  for result in results]
        for column in zip(*padded):
            # drop the padding, keeping only real (part, props) entries
            present = [entry for entry in column if entry]
            part = min((entry[0] for entry in present), key=len)
            merged = {}
            for entry in present:
                merged.update(entry[1])
            yield part, merged

    @staticmethod
    def _find(number, prefixes):
        """Recursively look up number in the given list of prefix entries.

        This returns a list of (part, props) tuples, which is what info()
        returns. An empty number results in an empty list and a number
        that matches none of the prefixes is returned as a single part
        without properties.
        """
        if not number:
            return []
        # every matching prefix gives one candidate: the matched part
        # followed by the lookup of the remainder in the prefix's children
        candidates = [
            [(number[:length], props)] +
            NumDB._find(number[length:], children)
            for length, low, high, props, children in (prefixes or ())
            if low <= number[:length] <= high and len(number) >= length
        ]
        if candidates:
            # merge the candidates into a single result
            return list(NumDB._merge(candidates))
        # not-found fallback
        return [(number, {})]

    def info(self, number):
        """Split the provided number in components and associate properties
        with each component. This returns a list of tuples. Each tuple
        consists of a string (a part of the number) and a dict of properties.
        """
        return NumDB._find(number, self.prefixes)

    def split(self, number):
        """Split the provided number in components. This returns a list with
        the parts of the number that were identified."""
        return [entry[0] for entry in self.info(number)]
164
165
def _parse(fp):
    """Parse the lines of a number database file into prefix entries.

    Lines that start with '#' and lines that only contain whitespace are
    skipped. Every other line should consist of optional leading spaces, a
    comma-separated list of values or low-high ranges and optionally a
    number of name="value" properties.

    This is a generator that yields an (indent, length, low, high, props)
    tuple for every range on a line. The indent is the number of leading
    spaces, length is the length of low and props is a dict with the
    properties found on the line (all ranges of one line share the same
    dict object). For a single value low and high are the same.

    A ValueError is raised for a line that cannot be parsed.
    """
    for line in fp:
        # ignore comments and empty lines (startswith() also copes with an
        # empty string, which indexing the first character does not)
        if line.startswith('#') or not line.strip():
            continue  # pragma: no cover (optimisation takes it out)
        # any other line should parse; report the offending line instead of
        # failing on the attribute access of a None match
        match = _line_re.search(line)
        if match is None:
            raise ValueError(
                'invalid line in number database: %r' % (line,))
        indent = len(match.group('indent'))
        props = dict(_prop_re.findall(match.group('props')))
        for rnge in match.group('ranges').split(','):
            # the pattern guarantees that both sides of a '-' are non-empty
            # so an empty high can only mean that this is a single value
            low, _, high = rnge.partition('-')
            yield indent, len(low), low, high or low, props
184
185
def read(fp):
    """Build a new NumDB from the entries that _parse() reads from fp.

    The indentation of each entry determines its place in the tree: an
    entry that is indented deeper than the previous one becomes a child of
    the most recently added entry at the previous indentation level.
    """
    db = NumDB()
    # maps an indentation level to the list that receives entries found at
    # that level
    levels = {0: db.prefixes}
    previous = 0
    for indent, length, low, high, props in _parse(fp):
        if indent > previous:
            # start a children list on the last entry of the previous level
            # and use it for everything at this deeper level
            parent = levels[previous][-1]
            parent[4] = []
            levels[indent] = parent[4]
        levels[indent].append([length, low, high, props, None])
        previous = indent
    return db
199
200
def get(name):
    """Return the database with the specified name to perform queries on.

    On first use the database is read from the name + '.dat' resource of
    this module (decoded as UTF-8) and stored in the cache of open
    databases. Later calls with the same name return the cached database.
    """
    try:
        return _open_databases[name]
    except KeyError:
        pass  # not loaded yet, read it below
    import codecs
    decode = codecs.getreader('utf-8')
    stream = resource_stream(__name__, name + '.dat')
    with decode(stream) as fp:
        _open_databases[name] = read(fp)
    return _open_databases[name]
209