1'''
2geodata.osm.extract
3-------------------
4
5Extracts nodes/ways/relations, their metadata and dependencies
6from .osm XML files.
7'''
8
9import re
10import six
11import urllib
12import HTMLParser
13
14from collections import OrderedDict
15from lxml import etree
16
17
18from geodata.csv_utils import unicode_csv_reader
19from geodata.text.normalize import normalize_string, NORMALIZE_STRING_DECOMPOSE, NORMALIZE_STRING_LATIN_ASCII
20from geodata.encoding import safe_decode, safe_encode
21
22
# Element ids at or above WAY_OFFSET (and below RELATION_OFFSET) are treated
# as offset-encoded way ids; ids at or above RELATION_OFFSET are treated as
# offset-encoded relation ids.
WAY_OFFSET = 10 ** 15
RELATION_OFFSET = 2 * WAY_OFFSET

# Names of the three OSM element types
NODE, WAY, RELATION = 'node', 'way', 'relation'

ALL_OSM_TAGS = {NODE, WAY, RELATION}
WAYS_RELATIONS = {WAY, RELATION}

# Tag keys holding the different name variants of an element
OSM_NAME_TAGS = (
    'name',
    'alt_name',
    'int_name',
    'nat_name',
    'reg_name',
    'loc_name',
    'official_name',
    'commonname',
    'common_name',
    'place_name',
    'short_name',
)

# Tag keys holding a base name
OSM_BASE_NAME_TAGS = ('tiger:name_base',)
50
51
def parse_osm(filename, allowed_types=ALL_OSM_TAGS, dependencies=False):
    '''
    Parse a file in .osm format iteratively, generating (key, attrs, deps)
    tuples like (shown here with dependencies=True):

    ('node:1', OrderedDict([('lat', '12.34'), ('lon', '23.45'), ...]), []),
    ('node:2', OrderedDict([('lat', '12.34'), ('lon', '23.45'), ...]), []),
    ('way:4444', OrderedDict([..., ('name', 'Main Street')]), [1, 2, 3, 4])

    :param filename: path of the .osm XML file to read.
    :param allowed_types: collection of element types to yield. When it
        contains exactly one type, the yielded key is the bare numeric id
        instead of a 'type:id' string.
    :param dependencies: if True, deps is a list of node refs for ways, a
        list of (ref, type, role) tuples for relation members that carry a
        role attribute, and an empty list for nodes. If False, deps is
        always None.

    attrs holds the element's XML attributes (without the raw id), then
    'type' and 'id', then one entry per <tag> child whose key does not
    collide with one of those top-level attributes.

    Ids in [WAY_OFFSET, RELATION_OFFSET) are reported as ways and ids at or
    above RELATION_OFFSET as relations, with the offset subtracted.

    The file is closed once the generator is exhausted or closed.
    '''
    single_type = len(allowed_types) == 1

    # Binary mode leaves decoding to the XML parser; the context manager
    # guarantees the handle is released when iteration stops.
    with open(filename, 'rb') as f:
        for (_, elem) in etree.iterparse(f):
            elem_id = long(elem.attrib.pop('id', 0))
            item_type = elem.tag
            if WAY_OFFSET <= elem_id < RELATION_OFFSET:
                elem_id -= WAY_OFFSET
                item_type = WAY
            elif elem_id >= RELATION_OFFSET:
                elem_id -= RELATION_OFFSET
                item_type = RELATION

            if item_type in allowed_types:
                attrs = OrderedDict(elem.attrib)
                attrs['type'] = item_type
                attrs['id'] = safe_encode(elem_id)

                top_level_attrs = set(attrs)
                deps = [] if dependencies else None

                # Iterating an element yields its direct children
                for e in elem:
                    if e.tag == 'tag':
                        # Prevent user-defined lat/lon keys from overriding the lat/lon on the node
                        tag_key = e.attrib['k']
                        if tag_key not in top_level_attrs:
                            attrs[tag_key] = e.attrib['v']
                    elif dependencies and item_type == WAY and e.tag == 'nd':
                        deps.append(long(e.attrib['ref']))
                    elif dependencies and item_type == RELATION and e.tag == 'member' and 'role' in e.attrib:
                        deps.append((long(e.attrib['ref']), e.attrib.get('type'), e.attrib['role']))

                key = elem_id if single_type else '{}:{}'.format(item_type, elem_id)
                yield key, attrs, deps

            if elem.tag in ALL_OSM_TAGS:
                # Release memory held by the finished element and drop the
                # siblings that precede it from the parent.
                elem.clear()
                while elem.getprevious() is not None:
                    del elem.getparent()[0]
102
103
def osm_type_and_id(element_id):
    '''
    Split a possibly offset-encoded element id into (type, id).

    Ids below WAY_OFFSET are nodes and are returned unchanged; ids in
    [WAY_OFFSET, RELATION_OFFSET) are ways and ids at or above
    RELATION_OFFSET are relations, each returned with its offset removed.
    '''
    numeric_id = long(element_id)

    if numeric_id < WAY_OFFSET:
        return NODE, numeric_id
    if numeric_id < RELATION_OFFSET:
        return WAY, numeric_id - WAY_OFFSET
    return RELATION, numeric_id - RELATION_OFFSET
116
# Matches "<title> (<apposition>)": group 1 is the text before the trailing
# parenthesized part, group 2 the text inside the parentheses, both without
# surrounding whitespace. Raw string so the regex escapes are not treated as
# (invalid) string escapes.
apposition_regex = re.compile(r'(.*[^\s])[\s]*\([\s]*(.*[^\s])[\s]*\)$', re.I)
118
# Shared parser instance; normalize_wikipedia_title calls its unescape()
# method on titles.
html_parser = HTMLParser.HTMLParser()
120
121
def normalize_wikipedia_title(title):
    '''
    Clean up a Wikipedia title.

    A trailing parenthesized apposition ("Title (something)") is dropped,
    then the title is decoded, HTML-unescaped and URL-unquoted (with '+'
    treated as a space). Finally underscores become spaces and surrounding
    whitespace is removed.
    '''
    apposition = apposition_regex.match(title)
    base_title = apposition.group(1) if apposition else title

    unescaped = html_parser.unescape(safe_decode(base_title))
    unquoted = urllib.unquote_plus(unescaped)

    return unquoted.replace(u'_', u' ').strip()
132
133
def osm_wikipedia_title_and_language(key, value):
    '''
    Derive (title, language) from a wikipedia tag's key and value.

    The language is the part of the key after its last colon, if any.
    Otherwise, when the value starts with a two-character prefix followed
    by a colon, that prefix is used as the language and removed from the
    value. Only the first colon is treated as the separator, so titles
    that themselves contain colons ("en:Star Trek: Voyager") stay intact.

    Returns the title normalized with normalize_wikipedia_title and the
    language (None when neither key nor value provides one).
    '''
    language = None
    if u':' in key:
        language = key.rsplit(u':', 1)[-1]

    if language is None and u':' in value:
        possible_language, _, remainder = value.partition(u':')
        if len(possible_language) == 2:
            language = possible_language
            value = remainder

    return normalize_wikipedia_title(value), language
146
147
# Character class matching one dash character: the ASCII hyphen-minus plus a
# set of Unicode hyphen/dash/minus code points written as unicode escapes.
non_breaking_dash = six.u('[-\u058a\u05be\u1400\u1806\u2010-\u2013\u2212\u2e17\u2e1a\ufe32\ufe63\uff0d]')
# A number: optional leading dash, one or more ASCII digits and an optional
# decimal fraction.
simple_number = six.u('(?:{})?[0-9]+(?:\.[0-9]+)?').format(non_breaking_dash)
simple_number_regex = re.compile(simple_number, re.UNICODE)

non_breaking_dash_regex = re.compile(non_breaking_dash, re.UNICODE)
# Two numbers separated by a dash; each number is captured in its own group.
number_range_regex = re.compile(six.u('({}){}({})').format(simple_number, non_breaking_dash, simple_number), re.UNICODE)
# Two single letters (word characters that are neither digits nor underscore)
# separated by a dash; each letter is captured in its own group.
# NOTE(review): the dash class is re-encoded with 'unicode-escape' before it is
# formatted into the raw template - presumably to keep both sides byte
# strings on Python 2; confirm before porting.
letter_range_regex = re.compile(r'([^\W\d_]){}([^\W\d_])'.format(non_breaking_dash.encode('unicode-escape')), re.UNICODE)

# Separators (comma or semicolon) on which parse_osm_number_range splits a
# value into individual entries.
number_split_regex = re.compile('[,;]')
157
158
def parse_osm_number_range(value, parse_letter_range=True, max_range=100):
    '''
    Expand a tag value containing numbers, ranges and lists into a flat list.

    The value is normalized with normalize_string, split on commas and
    semicolons, and each stripped piece is handled as follows:

    * ascending numeric range ("1-5"): every integer from start to end is
      appended, the end being capped at start + max_range. If the start
      begins with '0' the results are zero-padded to the start's width.
    * numeric range that is not ascending, or whose ends are not integers:
      the piece itself is appended (zero-padded as above).
    * ascending letter range ("A-D"), when parse_letter_range is true: every
      character from start to end by code point is appended, capped at
      max_range characters past the start. A non-ascending letter range
      contributes just its two end characters.
    * anything else: the piece is appended as is.

    Returns the list of expanded entries.
    '''
    normalized = normalize_string(value, string_options=NORMALIZE_STRING_LATIN_ASCII | NORMALIZE_STRING_DECOMPOSE)
    expanded = []

    for piece in number_split_regex.split(normalized):
        piece = piece.strip()

        numeric = number_range_regex.match(piece)
        if numeric:
            low, high = numeric.groups()
            # Keep the zero padding of the range start, e.g. "001-003"
            width = len(low) if low.startswith('0') else 0

            try:
                low = int(low)
                high = int(high)

                if high <= low:
                    # Not an ascending range: keep the text untouched
                    expanded.append(piece.strip().zfill(width))
                else:
                    if high - low > max_range:
                        high = low + max_range

                    for number in xrange(low, high + 1):
                        expanded.append(safe_decode(number).zfill(width))
            except (TypeError, ValueError):
                expanded.append(safe_decode(piece).strip().zfill(width))
            continue

        letters = letter_range_regex.match(piece)
        if not (letters and parse_letter_range):
            expanded.append(safe_decode(piece.strip()))
            continue

        first, last = (ord(letter) for letter in letters.groups())
        if last <= first:
            expanded.extend([six.unichr(first), six.unichr(last)])
            continue

        if last - first > max_range:
            last = first + max_range
        for code_point in xrange(first, last + 1):
            expanded.append(six.unichr(code_point))

    return expanded
208