1# -*- perl -*- 2 3# 4# Author: Slaven Rezic 5# 6# Copyright (C) 1998, 2000, 2001, 2002, 2003, 2004, 2010 Slaven Rezic. All rights reserved. 7# This package is free software; you can redistribute it and/or 8# modify it under the same terms as Perl itself. 9# 10# Mail: slaven@rezic.de 11# WWW: http://bbbike.sourceforge.net 12# 13 14package PLZ; 15 16use 5.005; # qr{} 17 18use strict; 19# Setting $OLD_AGREP to a true value really means: use String::Approx 20# instead or no agrep at all. 21use vars qw($PLZ_BASE_FILE @plzfile $OLD_AGREP $VERSION $VERBOSE $sep); 22use locale; 23use BBBikeUtil; 24use Strassen::Strasse; 25 26$VERSION = 1.76; 27 28use constant FMT_NORMAL => 0; # /usr/www/soc/plz/Berlin.data 29use constant FMT_REDUCED => 1; # ./data/Berlin.small.data (does not exist anymore) 30use constant FMT_COORDS => 2; # ./data/Berlin.coords.data 31use constant FMT_COORDS_WITH_INDEX => 3; # PLZ::Multi with -addindex option 32 33# agrep says that 32 is the max length, but experiments show something else: 34use constant AGREP_LONGEST_RX => 29; 35 36$PLZ_BASE_FILE = "Berlin.coords.data" if !defined $PLZ_BASE_FILE; 37 38# XXX use BBBikeUtil::bbbike_root().'/data'!!! 39@plzfile = 40 ((map { "$_/$PLZ_BASE_FILE" } @Strassen::datadirs), 41 BBBikeUtil::bbbike_root().'/data/'.$PLZ_BASE_FILE, 42 (map { ("$_/$PLZ_BASE_FILE", "$_/data/$PLZ_BASE_FILE") } @INC), 43 (map { ("$_/berlinco.dat", 44 "$_/Berlin.data", "$_/data/Berlin.data") } @INC), 45 ) if !@plzfile; 46$OLD_AGREP = 0 unless defined $OLD_AGREP; 47# on FreeBSD is 48# ports/textproc/agrep => agrep 2.04 with buggy handling of umlauts 49# ports/textproc/glimpse => agrep 3.0 50 51# indexes of file fields 52use constant FILE_NAME => 0; 53use constant FILE_CITYPART => 1; 54use constant FILE_ZIP => 2; # this is not valid for FMT_NORMAL 55use constant FILE_COORD => 3; # the "identification" coordinate 56use constant FILE_INDEX => 4; 57use constant FILE_STRTYPE => 5; # This is a placeholder, and not implemented now! 58 59use constant FILE_ZIP_FMT_NORMAL => 4; # this is only valid for FMT_NORMAL 60 61$sep = '|'; 62 63use constant SA_ANCHOR_LENGTH => 3; # use 0 to turn off String::Approx anchor hack 64use constant SA_ANCHOR_HACK => "�" x SA_ANCHOR_LENGTH; # use a rare character 65 66sub new { 67 my($class, $file) = @_; 68 my $self = {}; 69 if (!defined $file) { 70 foreach (@plzfile) { 71 if (-r $_ && open(DATA, $_)) { 72 $file = $_; 73 $self->{IsGzip} = 0; 74 } elsif (-r "$_.gz") { 75 if (is_in_path("gzip") && -d "/tmp" && -w "/tmp") { 76 require File::Basename; 77 my $dest = "/tmp/" . File::Basename::basename($_); 78 system("gzip -dc $_ > $dest"); 79 if (open(DATA, $dest)) { 80 if ($?/256 == 0) { 81 $file = $dest; 82 $self->{WasGzip} = 1; 83 } 84 } else { 85 warn "Cannot open $dest: $!"; 86 } 87 } 88 if (!defined $file) { 89 warn "Gzip file $_.gz cannot be handled"; 90 } 91 } 92 last if defined $file; 93 } 94 } elsif (defined $file) { 95 open(DATA, $file) or return undef; 96 } else { 97 return undef; 98 } 99 100 binmode DATA; 101 my($line) = <DATA>; 102 $line =~ s/[\015\012]//g; 103# Automatic detection of format. Caution: this means that the first line 104# in Berlin.coords.data must be complete i.e. having the coords field defined! 105 my(@l) = split(/\|/, $line); 106 if (@l == 3) { 107 $self->{DataFmt} = FMT_REDUCED; 108 $self->{FieldPLZ} = FILE_ZIP; 109 } elsif (@l == 4) { 110 $self->{DataFmt} = FMT_COORDS; 111 $self->{FieldPLZ} = FILE_ZIP; 112 } elsif (@l == 5) { 113 $self->{DataFmt} = FMT_COORDS_WITH_INDEX; 114 $self->{FieldPLZ} = FILE_ZIP; 115 } else { 116 $self->{DataFmt} = FMT_NORMAL; 117 $self->{FieldPLZ} = FILE_ZIP_FMT_NORMAL; 118 } 119 close DATA; 120 121 $self->{File} = $file; 122 $self->{Sep} = '|'; # XXX not yet used 123 bless $self, $class; 124} 125 126# Load the data into $self->{Data}. Not necessary for nearly all other 127# methods. 128sub load { 129 my($self, %args) = @_; 130 my $file = $args{File} || $self->{File}; 131 if (do { local $^W = 0; $file ne $self->{Data} }) { # XXX h�h??? 132 my @data; 133 open(PLZ, $file) 134 or die "Die Datei $file kann nicht ge�ffnet werden: $!"; 135 binmode PLZ; 136 137 my $code = <<'EOF'; 138 while(<PLZ>) { 139 chomp; 140 my(@l) = split(/\|/, $_); 141EOF 142 my $push_code; 143 if ($self->{DataFmt} == FMT_REDUCED) { 144 $push_code = q{push @data, 145 [@l[FILE_NAME, FILE_CITYPART, FILE_ZIP]]}; 146 } elsif ($self->{DataFmt} == FMT_COORDS) { 147 $push_code = q{push @data, 148 [@l[FILE_NAME, FILE_CITYPART, FILE_ZIP, FILE_COORD]]}; 149 } elsif ($self->{DataFmt} == FMT_COORDS_WITH_INDEX) { 150 $push_code = q{push @data, 151 [@l[FILE_NAME, FILE_CITYPART, FILE_ZIP, FILE_COORD, FILE_INDEX]]}; 152 } else { 153 $push_code = q{push @data, 154 [@l[FILE_NAME, FILE_CITYPART, FILE_ZIP_FMT_NORMAL]]}; 155 } 156 $code .= $push_code . <<'EOF'; 157 } 158EOF 159 eval $code; 160 close PLZ; 161 $self->{Data} = \@data; 162 $self->{File} = $file; 163 undef $self->{NameHash}; 164 undef $self->{PlzHash}; 165 } 166} 167 168sub make_plz_re { 169 my($self, $plz) = @_; 170 if ($self->{DataFmt} == FMT_REDUCED || 171 $self->{DataFmt} == FMT_COORDS || 172 $self->{DataFmt} == FMT_COORDS_WITH_INDEX) { 173 '^[^|]*|[^|]*|' . $plz; 174 } else { 175 '^[^|]*|[^|]*|[^|]*|[^|]*|' . $plz . '|'; 176 } 177} 178 179# indexes of return values 180use constant LOOK_NAME => 0; 181use constant LOOK_CITYPART => 1; 182use constant LOOK_ZIP => 2; 183use constant LOOK_COORD => 3; 184use constant LOOK_INDEX => 4; 185use constant LOOK_STRTYPE => 5; # This is a placeholder, and not implemented now! 186 187# XXX make gzip-aware 188# Argumente: (Beschreibung fehlt XXX) 189# Agrep/GrepType 190# Noextern 191# NoStringApprox 192# Citypart (optionale Einschr�nkung auf einen Bezirk oder Postleitzahl, 193# may also be an array reference to a number of cityparts) 194# MultiCitypart - empfehlenswert, wenn Citypart eine Postleitzahl ist! 195# MultiZIP 196# Ausgabe: Array von Referenzen [strasse, bezirk, plz, "x,y-Koordinate"] 197# Je nach Format der Quelldatei ($self->{DataFmt}) fehlt die x,y-Koordinate 198sub look { 199 my($self, $str, %args) = @_; 200 201 my $file = $args{File} || $self->{File}; 202 my %valid_cityparts; 203 if (defined $args{Citypart} && length $args{Citypart}) { 204 %valid_cityparts = map { (lc $_,1) } ref $args{Citypart} eq 'ARRAY' ? @{ $args{Citypart} } : $args{Citypart}; 205 } 206 207 my @res; 208 209 # Windows usually does not have grep and agrep externally 210 if ($^O eq 'MSWin32' && !exists $args{Noextern}) { 211 $args{Noextern} = 1; 212 } 213 214 print STDERR "->look($str, " . join(" ", %args) .") in '$file'\n" if $VERBOSE; 215 216 #XXX use fgrep instead of grep? slightly faster, no quoting needed! 217 my $grep_type = ($args{Agrep} ? 'agrep' : ($args{GrepType} || 'grep')); 218 my @push_inx; 219 if ($self->{DataFmt} == FMT_NORMAL) { 220 @push_inx = (FILE_NAME, FILE_CITYPART, $self->{FieldPLZ}); 221 } elsif ($self->{DataFmt} == FMT_REDUCED) { 222 @push_inx = (FILE_NAME, FILE_CITYPART, FILE_ZIP); 223 } elsif ($self->{DataFmt} == FMT_COORDS_WITH_INDEX) { 224 @push_inx = (FILE_NAME, FILE_CITYPART, FILE_ZIP, FILE_COORD, FILE_INDEX); 225 } else { 226 @push_inx = (FILE_NAME, FILE_CITYPART, FILE_ZIP, FILE_COORD); 227 } 228 if ($grep_type eq 'agrep') { 229 if ($OLD_AGREP || 230 (!$args{Noextern} && !is_in_path('agrep')) || 231 length($str) > AGREP_LONGEST_RX # otherwise there are "pattern too long" errors 232 # XXX AGREP_LONGEST_RX is not perfect --- the string is rx-escaped, see below 233 ) { 234 $args{Noextern} = 1; 235 } 236 if ($args{Noextern}) { 237 eval q{local $SIG{'__DIE__'}; 238 die "Won't use String::Approx" if $args{NoStringApprox}; 239 require String::Approx; 240 String::Approx->VERSION(2.7); 241 }; 242 if ($@) { 243 if ($args{Agrep} == 1) { 244 $grep_type = 'grep-umlaut'; 245 } else { 246 $grep_type = 'grep'; 247 } 248 } 249 } 250 } 251 if ($grep_type eq 'grep') { 252 if (!$args{Noextern} && !is_in_path('grep')) { 253 $args{Noextern} = 1; 254 } 255 } 256 257 my %res; 258 my $push_sub = sub { 259 my(@to_push) = (split(/\|/, $_[FILE_NAME]))[@push_inx]; 260 if (($args{MultiCitypart}|| 261 $to_push[FILE_CITYPART] eq "" || 262 !exists $res{$to_push[FILE_NAME]}->{$to_push[FILE_CITYPART]}) && 263 ($args{MultiZIP} || 264 $to_push[FILE_ZIP] eq "" || 265 !exists $res{$to_push[FILE_NAME]}->{$to_push[FILE_ZIP]}) 266 ) { 267 # filter by citypart (Bezirk) or ZIP 268 return if (keys %valid_cityparts && 269 !($valid_cityparts{lc $to_push[FILE_CITYPART]} || 270 $valid_cityparts{$to_push[FILE_ZIP]}) 271 ); 272 273 push @res, [@to_push]; 274 return if defined $args{Max} and $args{Max} < $#res; 275 $res{$to_push[FILE_NAME]}->{$to_push[FILE_CITYPART]}++; 276 $res{$to_push[FILE_NAME]}->{$to_push[FILE_ZIP]}++; 277 } 278 }; 279 280 if (!$args{Noextern} && $grep_type =~ /^a?grep$/) { 281 unless ($args{Noquote}) { 282 if ($grep_type eq 'grep') { 283 # XXX quotemeta verwenden? 284 $str =~ s/([\\.*\[\]])/\\$1/g; # quote metacharacters 285 } else { # agrep 286 $str =~ s/([\$\^\*\[\]\^\|\(\)\!\`\,\;])/\\$1/g; 287 } 288 $str = "^$str"; 289 } 290 291 # limitation of agrep: 292 if ($grep_type eq 'agrep' && length($str) > AGREP_LONGEST_RX) { 293 $str = substr($str, 0, AGREP_LONGEST_RX); 294 $str =~ s/\\$//; # remove a (lonely?) backslash at the end 295 # XXX but this will be wrong if it's really a \\ 296 } 297 298 if (eval { require Encode; Encode::is_utf8($str) }) { 299 $str = Encode::encode("iso-8859-1", $str); 300 } 301 my(@grep_args) = ('-i', $str, $file); 302 if ($grep_type eq 'agrep' && $args{Agrep}) { 303 unshift @grep_args, "-$args{Agrep}"; 304 } 305 my @cmd = ($grep_type, @grep_args); 306 warn "About to call <@cmd>" if $VERBOSE; 307 CORE::open(PLZ, "-|") or do { 308 $ENV{LANG} = $ENV{LC_ALL} = $ENV{LC_CTYPE} = 'C'; 309 # agrep emits some warnings "using working-directory '...' 310 # to locate dictionaries" if it does not have a $ENV{HOME} 311 # (which is probably a bug, because dictionaries are not 312 # used at all) 313 $ENV{HOME} = "/something"; 314 exec @cmd; 315 warn "While doing @cmd: $!"; 316 require POSIX; 317 POSIX::_exit(1); # avoid running any END blocks 318 }; 319 my %res; 320 binmode PLZ; 321 while(<PLZ>) { 322 chomp; 323 $push_sub->($_); 324 } 325 close PLZ; 326 } else { 327 CORE::open(PLZ, $file) 328 or die "Die Datei $file kann nicht ge�ffnet werden: $!"; 329 binmode PLZ; 330 if ($grep_type eq 'agrep') { 331 chomp(my @data = <PLZ>); 332 close PLZ; 333 my %res; 334 if (@data) { 335 foreach (map { substr $_, SA_ANCHOR_LENGTH } 336 String::Approx::amatch(SA_ANCHOR_HACK . $str, 337 ['i', $args{Agrep}], 338 map { SA_ANCHOR_HACK . $_ } @data)) { 339 $push_sub->($_); 340 } 341 } 342 } elsif ($grep_type =~ m{^grep-(umlaut|inword|substr)$}) { 343 my $sub_type = $1; 344 if ($sub_type eq 'umlaut') { 345 $str = '(?i:^' . quotemeta(BBBikeUtil::umlauts_to_german($str)) . ')'; 346 } elsif ($sub_type eq 'inword') { 347 $str = '(?i:\b' . quotemeta(BBBikeUtil::umlauts_to_german($str)) . '\b)'; 348 } elsif ($sub_type eq 'substr') { 349 $str = '(?i:' . quotemeta(BBBikeUtil::umlauts_to_german($str)) . ')'; 350 } 351 $str = qr{$str}; 352 while(<PLZ>) { 353 chomp; 354 if (BBBikeUtil::umlauts_to_german($_) =~ $str) { 355 $push_sub->($_); 356 } 357 } 358 close PLZ; 359 } else { 360 $str = quotemeta($str) unless $args{Noquote}; 361 $str = "^$str" unless $args{Noquote}; 362#XXX del? $str =~ s/\|/\\|/g; 363 $str = '(?i:' . $str . ')'; 364 $str = qr{$str}; 365 my %res; 366 while(<PLZ>) { 367 chomp; 368 if ($_ =~ $str) { 369 $push_sub->($_); 370 } 371 } 372 close PLZ; 373 } 374 } 375 376 @res; 377} 378 379# Argument: an array of references (the output of look()) 380# Combine records which form the same street (though same identification coordinate) 381# Returned value has the same format as the input 382# 383# Historical note: before 2010-07 this function did also guesses by 384# checking same citypart and/or same zip code. Unfortunately there are 385# actually two pairs of same-named streets in Berlin (Schoenhauser 386# Str. and Waldstr.) which have the same zip code though being 387# different. Previously Berlin.coords.data did not use the coordinate 388# as an id. 389sub combine { 390 my($self, @in) = @_; 391 my %out; 392 my @copy_indexes = (LOOK_NAME, LOOK_COORD); 393 if ($self->{DataFmt} eq FMT_COORDS_WITH_INDEX) { 394 push @copy_indexes, LOOK_INDEX; 395 } 396 CHECK_IT: 397 foreach my $s (@in) { 398 if (exists $out{$s->[LOOK_NAME]}) { 399 foreach my $r (@{ $out{$s->[LOOK_NAME]} }) { 400 my $eq_coord = $s->[LOOK_COORD] && $s->[LOOK_COORD] eq $r->[LOOK_COORD]; 401 if ($eq_coord) { 402 my $eq_cp = grep { $s->[LOOK_CITYPART] eq $_ } grep { $_ ne "" } @{ $r->[LOOK_CITYPART] }; 403 my $eq_zp = grep { $s->[LOOK_ZIP] eq $_ } grep { $_ ne "" } @{ $r->[LOOK_ZIP] }; 404 push @{ $r->[LOOK_CITYPART] }, $s->[LOOK_CITYPART] 405 unless $eq_cp; 406 push @{ $r->[LOOK_ZIP] }, $s->[LOOK_ZIP] 407 unless $eq_zp; 408 next CHECK_IT; 409 } 410 } 411 } 412 # does not exist or is a new citypart/zip combination 413 my $r = []; 414 $r->[$_] = $s->[$_] for (@copy_indexes); 415 $r->[LOOK_CITYPART] = [ $s->[LOOK_CITYPART] ]; 416 $r->[LOOK_ZIP] = [ $s->[LOOK_ZIP ] ]; 417 push @{ $out{$s->[LOOK_NAME]} }, $r; 418 } 419 map { @$_ } values %out; 420} 421 422# converts an array element from combine from 423# ["Hauptstr.", ["Friedenau","Schoeneberg],[10827,12159], $coord] 424# to 425# ["Hauptstr.", "Friedenau, Schoeneberg", "10827,12159", $coord] 426sub combined_elem_to_string_form { 427 my($self, $elem) = @_; 428 my @copy_indexes = (LOOK_NAME, LOOK_COORD); 429 if ($self->{DataFmt} eq FMT_COORDS_WITH_INDEX) { 430 push @copy_indexes, LOOK_INDEX; 431 } 432 my $r = []; 433 $r->[$_] = $elem->[$_] for (@copy_indexes); 434 $r->[LOOK_CITYPART] = join(", ", @{$elem->[LOOK_CITYPART]}); 435 $r->[LOOK_ZIP] = join(", ", @{$elem->[LOOK_ZIP]}); 436 $r; 437} 438 439# Split a street specification like "Heerstr. (Charlottenburg, Spandau)" 440# to the street component and the citypart components 441sub split_street { 442 my $street = shift; 443 my @cityparts; 444 ($street, @cityparts) = Strasse::split_street_citypart($street); 445 if (@cityparts) { 446 ($street, Citypart => \@cityparts); 447 } else { 448 ($street); 449 } 450} 451 452# Match-Reihenfolge: 453# * nicht modifiziert ohne Agrep 454# * "strasse" nach "str." umgewandelt ohne Agrep 455# * inkrementell bis $args{Agrep} mit Agrep abwechselnd nicht modifiziert und 456# mit s/strasse/str./ 457# Argumente in %args: 458# Agrep: 0, wenn grep verwendet werden soll 459# >0, wenn mit Fehlern gesucht werden, dann gibt der Wert die 460# maximale Anzahl der erlaubten Fehler an 461# 'default', wenn der Standardwert von 3 Fehlern genommen werden soll 462# Bei l�ngeren W�rtern wird die Maximalanzahl bis 5 erh�ht. 463# Sonstige Argumente werden nach look() durchgereicht. 464# Ausgabe: 465# erstes Element: siehe look() (als Arrayreferenz) 466# zweites Element: Anzahl der Fehler f�r das Ergebnis 467# Wenn $args{LookCompat} gesetzt ist, dann ist die Ausgabe genau wie bei 468# look(). 469sub look_loop { 470 my($self, $str, %args) = @_; 471 my $max_agrep; 472 if (defined $args{Agrep} && $args{Agrep} eq 'default') { 473 $max_agrep = 3; 474 # Allow more errors for longer strings: 475 if (length($str) > 15) { $max_agrep = 4 } 476 elsif (length($str) > 25) { $max_agrep = 5 } 477 delete $args{Agrep}; 478 } else { 479 $max_agrep = delete $args{Agrep} || 0; 480 } 481 482 my $agrep = 0; 483 my @matchref; 484 # 1. Try unaltered 485 @matchref = $self->look($str, %args); 486 if (!@matchref) { 487 # 2. Try to strip house number 488 if (my $str0 = _strip_hnr($str)) { 489 @matchref = $self->look($str0, %args); 490 } 491 # 3. Try to strip "stra�e" => "str." 492 # 3b. Strip house number 493 if (!@matchref) { 494 if (my $str0 = _strip_strasse($str)) { 495 @matchref = $self->look($str0, %args); 496 if (!@matchref) { 497 if ($str0 = _strip_hnr($str0)) { 498 @matchref = $self->look($str0, %args); 499 } 500 } 501 } 502 } 503 # 4. Try to expand "Str." on beginning of the string 504 # 4b. Strip house number 505 if (!@matchref) { 506 if (my $str0 = _expand_strasse($str)) { 507 @matchref = $self->look($str0, %args); 508 if (!@matchref) { 509 if ($str0 = _strip_hnr($str0)) { 510 @matchref = $self->look($str0, %args); 511 } 512 } 513 } 514 } 515 # 5. Try word match in the middle of the string 516 if (!@matchref && length $str >= 4) { 517 my %args = %args; 518 delete $args{Agrep}; 519 $args{GrepType} = "grep-inword"; 520 @matchref = $self->look($str, %args); 521 } 522 # 6. Use increasing approximate match. Try first unaltered, then 523 # with stripped street, then without house number. 524 if (!@matchref) { 525 $agrep = 1; 526 while ($agrep <= $max_agrep) { 527 @matchref = $self->look($str, %args, Agrep => $agrep); 528 if (!@matchref && (my $str0 = _strip_strasse($str))) { 529 @matchref = $self->look($str0, %args, Agrep => $agrep); 530 } 531 if (!@matchref && (my $str0 = _strip_hnr($str))) { 532 @matchref = $self->look($str0, %args, Agrep => $agrep); 533 } 534 { 535 my $str0; 536 if (!@matchref 537 && ($str0 = _strip_strasse($str))) { 538 @matchref = $self->look($str0, %args, Agrep => $agrep); 539 if (!@matchref 540 && ($str0 = _strip_hnr($str0))) { 541 @matchref = $self->look($str0, %args, Agrep => $agrep); 542 } 543 } 544 } 545 { 546 my $str0; 547 if (!@matchref 548 && ($str0 = _expand_strasse($str))) { 549 @matchref = $self->look($str0, %args, Agrep => $agrep); 550 if (!@matchref 551 && ($str0 = _strip_hnr($str0))) { 552 @matchref = $self->look($str0, %args, Agrep => $agrep); 553 } 554 } 555 } 556 last if @matchref; 557 $agrep++; 558 } 559 } 560 } 561 if ($args{LookCompat}) { 562 @matchref; 563 } else { 564 (\@matchref, $agrep); 565 } 566} 567 568sub _strip_strasse { 569 my $str = shift; 570 if ($str =~ /stra(?:ss|�)e/i) { 571 $str =~ s/(s)tra(?:ss|�)e/$1tr./i; 572 $str; 573 } else { 574 undef; 575 } 576} 577 578sub _strip_hnr { 579 my $str = shift; 580 # This strips input like "Stra�e 1a" or "Stra�e 1-2". Maybe 581 # also strip "Stra�e 1 a"? XXX 582 if ($str =~ m{\s+(?:\d+[a-z]?|\d+\s*[-/]\s*\d+)\s*$}) { 583 $str =~ s{\s+(?:\d+[a-z]?|\d+\s*[-/]\s*\d+)\s*$}{}; 584 $str; 585 } else { 586 undef; 587 } 588} 589 590sub _expand_strasse { 591 my $str = shift; 592 my $replaced = 0; 593 if ($str =~ s/^(U\+S|S\+U)[- ](?:Bahnhof|Bhf\.?)\s+/S-Bhf /i) { # Choose one 594 $replaced++; 595 } elsif ($str =~ s/^(U\+S|S\+U)\s+/S-Bhf /i) { # Choose one 596 $replaced++; 597 } elsif ($str =~ s/^([US])[- ](?:Bahnhof|Bhf\.?)\s+/uc($1)."-Bhf "/ie) { 598 $replaced++; 599 } elsif ($str =~ s/^([US])Bhf\.?\s+/uc($1)."-Bhf "/ie) { # without space or dash... 600 $replaced++; 601 } elsif ($str =~ s/^([US])\s+/uc($1)."-Bhf "/ie) { 602 $replaced++; 603 } 604 if ($str =~ s/^(k)l\.?\s+(.*str)/$1leine $2/i) { 605 $replaced++; 606 } elsif ($str =~ s/^(g)r\.?\s+(.*str)/$1ro�e $2/i) { 607 $replaced++; 608 } 609 if ($str =~ /^\s*str\.(\S)?/i) { 610 if (defined $1) { # add space 611 $str =~ s/^\s*(s)tr\./$1tra�e /i; 612 } else { 613 $str =~ s/^\s*(s)tr\./$1tra�e/i; 614 } 615 $replaced++; 616 $str; 617 } elsif ($str =~ s/^\s*(s)trasse/$1tra�e/i) { 618 $replaced++; 619 $str; 620 } elsif ($replaced) { 621 $str; 622 } else { 623 undef; 624 } 625} 626 627# Sortiert die Stra�en eines look_loop-Ergebnisses. 628# Argumente und R�ckgabewerte sind vom gleichen Format wie bei look_loop. 629sub look_loop_best { 630 my($self, $str, %args) = @_; 631 my $look_compat = delete $args{LookCompat}; 632 my($matchref, $agrep) = $self->look_loop($str, %args); 633 if (@$matchref) { 634 my @rating; 635 my $str_rx = qr{(?i:^\Q$str\E)}; 636 for(my $i=0; $i<=$#$matchref; $i++) { 637 my $item = $matchref->[$i]; 638 if ($item->[LOOK_NAME] eq $str) { 639 push @rating, [ 100, $item ]; 640 } elsif ($item->[LOOK_NAME] =~ $str_rx) { 641 push @rating, [ 40 + 40-length($item->[LOOK_NAME]), $item ]; 642 } else { 643 push @rating, [ 40-length($item->[LOOK_NAME]), $item ]; 644 } 645 } 646 $matchref = [map { $_->[1] } sort { $b->[0] <=> $a->[0] } @rating]; 647 } 648 if ($look_compat) { 649 @$matchref; 650 } else { 651 ($matchref, $agrep); 652 } 653} 654 655# In: an array of indexes FILE_... 656# Out: a hashref $hash->{VAL_INDEX_1}{VAL_INDEX_2}{...} = [$pos1, $pos2, ...] 657sub make_any_hash { 658 my($self, @indexes) = @_; 659 die "Please call the load() method first" if !$self->{Data}; 660 my %hash; 661 my $i = 0; 662 foreach my $datarec (@{$self->{Data}}) { 663 my $h = \%hash; 664 for(my $index_i = 0; $index_i <= $#indexes; $index_i++) { 665 my $field_val = $datarec->[$indexes[$index_i]]; 666 if ($index_i == $#indexes) { 667 push @{$h->{$field_val}}, $i; 668 } else { 669 $h = $h->{$field_val} ||= {}; 670 } 671 } 672 $i++; 673 } 674 \%hash; 675} 676 677sub as_streets { 678 my $self = shift; 679 my(%args) = @_; 680 my $cat = $args{Cat} || 'X'; 681 682 my @data; 683 684 if ($self->{DataFmt} ne FMT_COORDS) { 685 die "Only PLZ format FMT_COORDS (".FMT_COORDS.") is supported, not " . $self->{DataFmt}; 686 } 687 CORE::open(F, $self->{File}) or die "Can't open $self->{File}: $!"; 688 binmode F; 689 while(<F>) { 690 chomp; 691 my(@f) = split /\|/; 692 push @data, $f[FILE_NAME]." (".$f[FILE_CITYPART].", ".$f[FILE_ZIP].")\t$cat ".$f[FILE_COORD]."\n" 693 if defined $f[FILE_COORD] && $f[FILE_COORD] ne ''; 694 } 695 close F; 696 697 require Strassen; 698 my $s = Strassen->new_from_data_ref(\@data); 699 $s->{File} = $self->{File}; 700 $s; 701} 702 703# convert Strassen.pm object to PLZ.pm data file 704# my $new_data = PLZ->new_data_from_streets(new Strassen ...); 705sub new_data_from_streets { 706 my($class, $s) = @_; 707 my $ret = ""; 708 $s->init; 709 while(1) { 710 my $r = $s->next; 711 last if !@{ $r->[Strassen::COORDS()] }; 712 my($street, %args) = split_street($r->[Strassen::NAME()]); 713 $ret .= "$street$sep"; 714 if ($args{Citypart}) { 715 $ret .= join(", ", @{ $args{Citypart} }); 716 } 717 $ret .= "$sep$sep"; 718 $ret .= $r->[Strassen::COORDS()][$#{$r->[Strassen::COORDS()]}/2]; 719 $ret .= "\n"; 720 } 721 $ret; 722} 723 724sub zip_to_cityparts_hash { 725 my($self, %args) = @_; 726 my $cachebase; 727 my $h; 728 if ($args{UseCache}) { 729 require Strassen::Util; 730 require File::Basename; 731 $cachebase = "zip_to_cityparts_" . File::Basename::basename($self->{File}); 732 $h = Strassen::Util::get_from_cache($cachebase, [$self->{File}]); 733 if ($h) { 734 warn "Using cache for $cachebase\n" if $VERBOSE; 735 return $h; 736 } 737 } 738 739 my $hh; 740 open(PLZ, $self->{File}) 741 or die "Die Datei $self->{File} kann nicht ge�ffnet werden: $!"; 742 binmode PLZ; 743 while(<PLZ>) { 744 chomp; 745 my(@l) = split(/\|/, $_); 746 if ($l[FILE_ZIP] ne "" && $l[FILE_CITYPART] ne "") { 747 $hh->{$l[FILE_ZIP]}{$l[FILE_CITYPART]}++; 748 } 749 } 750 close PLZ; 751 752 while(my($k,$v) = each %$hh) { 753 $h->{$k} = [keys %$v]; 754 } 755 756 if (defined $cachebase) { 757 Strassen::Util::write_cache($h, $cachebase); 758 warn "Wrote cache ($cachebase)\n" if $VERBOSE; 759 } 760 $h; 761} 762 763sub norm_street { 764 my $str = shift; 765 $str =~ s/(s)tra(?:ss|�)e$/$1tr\./i; # XXX more? 766 $str =~ s/^\s+//; 767 $str =~ s/\s+$//; 768 $str =~ s/ +/ /g; 769 $str; 770} 771 772sub streets_hash { 773 my $self = shift; 774 my %hash; 775 open(D, $self->{File}) or die "Can't open $self->{File}: $!"; 776 binmode D; 777 my $pos = tell(D); 778 while(<D>) { 779 chomp; 780 /^(.+?)\|/; 781 my $l = $1; 782 if (!exists $hash{$l}) { 783 $hash{$l} = $pos; 784 } 785 $pos = tell(D); 786 } 787 close D; 788 \%hash; 789} 790 791sub street_words_hash { 792 my $self = shift; 793 my %hash; 794 open(D, $self->{File}) or die "Can't open $self->{File}: $!"; 795 binmode D; 796 my $pos = tell(D); 797 while(<D>) { 798 chomp; 799 /^(.+?)\|/; 800 my @s = split /\s+/, $1; 801 my $h = \%hash; 802 for my $i (0 .. $#s) { 803 if (!exists $h->{$s[$i]}) { 804 if ($i == $#s) { 805 $h->{$s[$i]} = $pos; 806 } else { 807 $h->{$s[$i]} = {}; 808 $h = $h->{$s[$i]}; 809 } 810 } else { 811 my $old_h = $h->{$s[$i]}; 812 if (!UNIVERSAL::isa($old_h, 'HASH')) { 813 $h->{$s[$i]} = {"" => $old_h}; 814 $old_h = $h->{$s[$i]}; 815 } 816 if ($i == $#s) { 817 if (!exists $old_h->{""}) { 818 $old_h->{""} = $pos; 819 } 820 } else { 821 $h = $h->{$s[$i]}; 822 } 823 } 824 } 825 $pos = tell(D); 826 } 827 close D; 828 \%hash; 829} 830 831# Arguments: 832# $text: string to examine 833# $h: result of street_words_hash 834# XXX still a simple-minded solution 835sub find_streets_in_text { 836 my($self, $text, $h) = @_; 837 $h = $self->{StreetWordsHash} if !$h; 838 my @res; 839 my @s = split /(\s+)/, $text; 840 my $begin = 0; 841 my $length; 842 for(my $i = 0; $i <= $#s; $i+=2) { 843 $length = length($s[$i]); 844 if ($s[$i] =~ /^(s)tra(?:ss|�)e$/i) { 845 $s[$i] = "$1tr."; 846 } 847 my $ii = 0; 848 if (exists $h->{$s[$i]}) { 849 my $s = $s[$i]; 850 my $hh = $h->{$s[$i]}; 851 while (1) { 852 if (!UNIVERSAL::isa($hh, 'HASH')) { 853 push @res, [$s, $begin, $length]; 854 last; 855 } 856 if (!exists $hh->{$s[$i+$ii+2]}) { 857 if (exists $hh->{""}) { 858 push @res, [$s, $begin, $length]; 859 } 860 last; 861 } 862 $ii+=2; 863 $s .= " $s[$i+$ii]"; 864 $length += length($s[$i+$ii-1]) + length($s[$i+$ii]); 865 $hh = $hh->{$s[$i+$ii]}; 866 } 867 } 868 869 $i += $ii; 870 $begin += $length; 871 if (defined $s[$i+1]) { 872 $begin += length($s[$i+1]); 873 } 874 } 875 \@res; 876} 877 878sub get_street_type { 879 my($self, $look_result) = @_; 880 if (defined $look_result->[LOOK_STRTYPE]) { # This is not yet defined, but maybe some day? 881 $look_result->[LOOK_STRTYPE]; 882 } else { 883 my $name = $look_result->[LOOK_NAME]; 884 if ($name =~ m{(^Kolonie\s 885 |^KGA\s 886 |\s\(Kolonie\)$ 887 )}x) { 888 return 'orchard'; 889 } elsif ($name =~ m{^[SU]-Bhf\.?\s}) { 890 return 'railway station'; 891 } elsif ($name =~ m{\s\(Park\)$}) { 892 return 'park'; 893 } elsif ($name =~ m{\s\(Gastst�tte\)$}) { 894 return 'restaurant'; 895 } elsif ($name =~ m{\s\(Siedlung\)$}) { 896 return 'settlement'; # XXX English wording? 897 } elsif ($name =~ m{(?:^Insel\s|\s\(Insel\)$)}) { 898 return 'island'; 899 } elsif ($name =~ m{\s\(geplant\)$}) { 900 return 'projected street'; 901 } else { 902 return 'street'; 903 } 904 } 905} 906 907# This method may be removed or renamed one day! 908sub _populate_street_type { 909 my($self, $look_result) = @_; 910 my $type = $self->get_street_type($look_result); 911 $look_result->[LOOK_STRTYPE] = $type; 912} 913 914return 1 if caller(); 915 916###################################################################### 917# 918# standalone program 919# 920package main; 921require Getopt::Long; 922 923my $agrep = "default"; 924my $extern = 1; 925my $citypart; 926my $multi_citypart = 0; 927my $multi_zip = 0; 928my $grep_type; 929 930if (!Getopt::Long::GetOptions 931 ("agrep=i" => \$agrep, 932 "extern!" => \$extern, 933 "citypart=s" => \$citypart, 934 "multicitypart!" => \$multi_citypart, 935 "multizip!" => \$multi_zip, 936 "greptype=s" => \$grep_type, 937 "v!" => \$PLZ::VERBOSE, 938 ) 939 ) { 940 die "Usage: $0 [-v] [-agrep errors] [-greptype grep-inword|grep-umlaut|...] 941 [-extern] [-citypart citypart] 942 [-multicitypart] [-multizip] street 943"; 944} 945 946my $street = shift || die "Street?"; 947 948my $plz = PLZ->new; 949 950my @args; 951push @args, "Agrep", $agrep; 952if ($grep_type) { 953 push @args, "GrepType", $grep_type; 954} 955if (!$extern) { 956 push @args, "Noextern", 1; 957} 958if (defined $citypart and $citypart ne "") { 959 push @args, "Citypart", $citypart; 960} 961if ($multi_citypart) { 962 push @args, "MultiCitypart", 1; 963} 964if ($multi_zip) { 965 push @args, "MultiZIP", 1; 966} 967 968my($res_ref, $errors) = $plz->look_loop(PLZ::split_street($street), @args); 969foreach my $res (@$res_ref) { 970 printf "%-40s %-20s %-10s (%s)\n", @$res; 971} 972print "*** Errors: $errors\n"; 973 974###################################################################### 975# Ein Kuriosum in Berlin: sowohl die Waldstr. in Gr�nau als auch die 976# Waldstr. in Schm�ckwitz haben die gleiche PLZ 12527. Erschwerend kommt 977# hinzu, dass Gr�nau (fr�her K�penick) und Schm�ckwitz (fr�her Treptow) 978# heute im gleichen Bezirk liegen. Siehe auch combine() f�r die derzeitige 979# L�sung des Problems. 980 981# Weiterer Fall: es gibt zweimal den Mittelweg, PLZ 12524, aber in 982# unterschiedlichen Stadtteilen im gleichen Bezirk: Altglienicke und 983# Bohnsdorf 984 985# Quick check: 986# perl -Ilib -MData::Dumper -MPLZ -e '$p=PLZ->new;warn Dumper $p->look_loop($ARGV[0], Max => 1, MultiZIP => 1, MultiCitypart => 1, Agrep => "default")' ... 987# 988# Convert to bbd: 989# perl -F'\|' -nale 'print "@F[0,1,2]\tX $F[3]" if $F[3]' Berlin.coords.data > /tmp/plz.bbd 990 991