'''
geodata.osm.extract
-------------------

Extracts nodes/ways/relations, their metadata and dependencies
from .osm XML files.
'''

import re
import six

from collections import OrderedDict
from lxml import etree

try:
    # Python 3
    from html import unescape as _html_unescape
    from html.parser import HTMLParser
    from urllib.parse import unquote_plus
except ImportError:
    # Python 2: HTMLParser.unescape (removed in Python 3.9) is the only
    # HTML-entity decoder the Py2 stdlib provides
    from HTMLParser import HTMLParser
    from urllib import unquote_plus
    _html_unescape = None

from six.moves import range

from geodata.csv_utils import unicode_csv_reader
from geodata.text.normalize import normalize_string, NORMALIZE_STRING_DECOMPOSE, NORMALIZE_STRING_LATIN_ASCII
from geodata.encoding import safe_decode, safe_encode


# Synthetic id scheme: ways and relations can be stored in the same integer
# id space as nodes by adding these offsets to their native OSM ids.
WAY_OFFSET = 10 ** 15
RELATION_OFFSET = 2 * 10 ** 15

NODE = 'node'
WAY = 'way'
RELATION = 'relation'

ALL_OSM_TAGS = set([NODE, WAY, RELATION])
WAYS_RELATIONS = set([WAY, RELATION])

# Tag keys whose values are place names (some language-specific variants)
OSM_NAME_TAGS = (
    'name',
    'alt_name',
    'int_name',
    'nat_name',
    'reg_name',
    'loc_name',
    'official_name',
    'commonname',
    'common_name',
    'place_name',
    'short_name',
)

OSM_BASE_NAME_TAGS = (
    'tiger:name_base',
)


def parse_osm(filename, allowed_types=ALL_OSM_TAGS, dependencies=False):
    '''
    Parse a file in .osm XML format iteratively, generating 3-tuples of
    (key, attributes, dependencies) like:

    ('node:1', OrderedDict([('lat', '12.34'), ('lon', '23.45')]), None),
    ('node:2', OrderedDict([('lat', '12.34'), ('lon', '23.45')]), None),
    ('way:4444', OrderedDict([('name', 'Main Street')]), [1, 2, 3, 4])

    :param filename: path to a .osm XML file
    :param allowed_types: subset of {'node', 'way', 'relation'} to yield.
                          When exactly one type is allowed, the yielded key
                          is the bare numeric id instead of 'type:id'.
    :param dependencies: when True, collect way node refs and relation
                         members into the third tuple element (otherwise
                         it is None)
    '''
    single_type = len(allowed_types) == 1

    f = open(filename)
    try:
        for _, elem in etree.iterparse(f):
            elem_id = int(elem.attrib.pop('id', 0))
            item_type = elem.tag
            # Ids at or above the offsets are ways/relations mapped into the
            # combined id space; translate back to native id and type
            if WAY_OFFSET <= elem_id < RELATION_OFFSET:
                elem_id -= WAY_OFFSET
                item_type = WAY
            elif elem_id >= RELATION_OFFSET:
                elem_id -= RELATION_OFFSET
                item_type = RELATION

            if item_type in allowed_types:
                attrs = OrderedDict(elem.attrib)
                attrs['type'] = item_type
                attrs['id'] = safe_encode(elem_id)

                top_level_attrs = set(attrs)
                deps = [] if dependencies else None

                for e in elem:
                    if e.tag == 'tag':
                        # Prevent user-defined lat/lon keys from overriding
                        # the lat/lon on the node
                        key = e.attrib['k']
                        if key not in top_level_attrs:
                            attrs[key] = e.attrib['v']
                    elif dependencies and item_type == WAY and e.tag == 'nd':
                        deps.append(int(e.attrib['ref']))
                    elif dependencies and item_type == RELATION and e.tag == 'member' and 'role' in e.attrib:
                        deps.append((int(e.attrib['ref']), e.attrib.get('type'), e.attrib['role']))

                key = elem_id if single_type else '{}:{}'.format(item_type, elem_id)
                yield key, attrs, deps

            # Release elements already consumed so arbitrarily large files
            # can be parsed in bounded memory
            if elem.tag in ALL_OSM_TAGS:
                elem.clear()
                while elem.getprevious() is not None:
                    del elem.getparent()[0]
    finally:
        # Previously the handle was never closed (leaked per call)
        f.close()


def osm_type_and_id(element_id):
    '''
    Invert the offset scheme: return (type, native_id) for an id in the
    combined node/way/relation id space.
    '''
    element_id = int(element_id)
    if element_id >= RELATION_OFFSET:
        id_type = RELATION
        element_id -= RELATION_OFFSET
    elif element_id >= WAY_OFFSET:
        id_type = WAY
        element_id -= WAY_OFFSET
    else:
        id_type = NODE

    return id_type, element_id


# Matches "Title (disambiguator)" and captures both parts
apposition_regex = re.compile(r'(.*[^\s])[\s]*\([\s]*(.*[^\s])[\s]*\)$', re.I)

html_parser = HTMLParser()

# HTMLParser.unescape was removed in Python 3.9; prefer html.unescape
_unescape = _html_unescape if _html_unescape is not None else html_parser.unescape


def normalize_wikipedia_title(title):
    '''
    Normalize a Wikipedia article title for matching: strip a trailing
    parenthesized disambiguator like "Springfield (Ohio)", decode HTML
    entities and URL percent-escapes, and replace underscores with spaces.
    '''
    match = apposition_regex.match(title)
    if match:
        title = match.group(1)

    title = safe_decode(title)
    title = _unescape(title)
    title = unquote_plus(title)

    return title.replace(u'_', u' ').strip()


def osm_wikipedia_title_and_language(key, value):
    '''
    Split an OSM wikipedia tag into (normalized_title, language).

    The language may come from the key ("wikipedia:en") or from a
    2-letter prefix on the value ("en:Some Article"); returns None for
    the language when neither is present.
    '''
    language = None
    if u':' in key:
        key, language = key.rsplit(u':', 1)

    if u':' in value:
        possible_language, rest = value.split(u':', 1)
        # Only strip a leading prefix that looks like an ISO-639-1 code.
        # Previously everything up to the *last* colon was discarded even
        # when no language prefix was present, mangling titles that
        # legitimately contain colons (e.g. "Dune: Part Two").
        if len(possible_language) == 2:
            if language is None:
                language = possible_language
            value = rest

    return normalize_wikipedia_title(value), language


# Character class of ASCII hyphen plus the common Unicode dash codepoints
non_breaking_dash = six.u('[-\u058a\u05be\u1400\u1806\u2010-\u2013\u2212\u2e17\u2e1a\ufe32\ufe63\uff0d]')
simple_number = six.u('(?:{})?[0-9]+(?:\\.[0-9]+)?').format(non_breaking_dash)
simple_number_regex = re.compile(simple_number, re.UNICODE)

non_breaking_dash_regex = re.compile(non_breaking_dash, re.UNICODE)
number_range_regex = re.compile(six.u('({}){}({})').format(simple_number, non_breaking_dash, simple_number), re.UNICODE)
# Note: formatting the unicode class directly into a unicode pattern; the
# previous .encode('unicode-escape') produced bytes and broke on Python 3
letter_range_regex = re.compile(six.u('([^\\W\\d_]){}([^\\W\\d_])').format(non_breaking_dash), re.UNICODE)

number_split_regex = re.compile('[,;]')


def parse_osm_number_range(value, parse_letter_range=True, max_range=100):
    '''
    Expand a house-number specification like "1-5", "2;4;6" or "A-C" into
    the list of individual values as unicode strings.

    :param value: raw tag value, e.g. "100-120" or "1,3,5"
    :param parse_letter_range: also expand single-letter ranges like "A-E"
    :param max_range: cap on how many consecutive values one range may add
                      (guards against typos like "1-100000")
    '''
    value = normalize_string(value, string_options=NORMALIZE_STRING_LATIN_ASCII | NORMALIZE_STRING_DECOMPOSE)
    numbers = []
    values = number_split_regex.split(value)
    for val in values:
        val = val.strip()
        match = number_range_regex.match(val)
        if match:
            start_num, end_num = match.groups()

            # Preserve zero-padding: "01-05" expands to "01".."05"
            zfill = len(start_num) if start_num.startswith('0') else 0

            try:
                start_num = int(start_num)
                end_num = int(end_num)

                if end_num > start_num:
                    if end_num - start_num > max_range:
                        end_num = start_num + max_range

                    for i in range(start_num, end_num + 1):
                        numbers.append(safe_decode(i).zfill(zfill))
                else:
                    numbers.append(val.strip().zfill(zfill))
                continue
            except (TypeError, ValueError):
                # Non-ASCII dash variants make int() fail; fall back to
                # keeping the raw token
                numbers.append(safe_decode(val).strip().zfill(zfill))
                continue

        else:
            letter_match = letter_range_regex.match(val)
            if letter_match and parse_letter_range:
                start_num, end_num = letter_match.groups()
                start_num = ord(start_num)
                end_num = ord(end_num)
                if end_num > start_num:
                    if end_num - start_num > max_range:
                        end_num = start_num + max_range
                    for i in range(start_num, end_num + 1):
                        numbers.append(six.unichr(i))
                else:
                    numbers.extend([six.unichr(start_num), six.unichr(end_num)])
                continue
            else:
                numbers.append(safe_decode(val.strip()))
    return numbers