1# -*- perl -*-
2
3#
4# Author: Slaven Rezic
5#
6# Copyright (C) 1998, 2000, 2001, 2002, 2003, 2004, 2010 Slaven Rezic. All rights reserved.
7# This package is free software; you can redistribute it and/or
8# modify it under the same terms as Perl itself.
9#
10# Mail: slaven@rezic.de
11# WWW:  http://bbbike.sourceforge.net
12#
13
14package PLZ;
15
16use 5.005; # qr{}
17
18use strict;
19# Setting $OLD_AGREP to a true value really means: use String::Approx
20# instead or no agrep at all.
21use vars qw($PLZ_BASE_FILE @plzfile $OLD_AGREP $VERSION $VERBOSE $sep);
22use locale;
23use BBBikeUtil;
24use Strassen::Strasse;
25
26$VERSION = 1.76;
27
28use constant FMT_NORMAL            => 0; # /usr/www/soc/plz/Berlin.data
29use constant FMT_REDUCED           => 1; # ./data/Berlin.small.data (does not exist anymore)
30use constant FMT_COORDS            => 2; # ./data/Berlin.coords.data
31use constant FMT_COORDS_WITH_INDEX => 3; # PLZ::Multi with -addindex option
32
33# agrep says that 32 is the max length, but experiments show something else:
34use constant AGREP_LONGEST_RX => 29;
35
36$PLZ_BASE_FILE = "Berlin.coords.data" if !defined $PLZ_BASE_FILE;
37
38# XXX use BBBikeUtil::bbbike_root().'/data'!!!
39@plzfile =
40  ((map { "$_/$PLZ_BASE_FILE" } @Strassen::datadirs),
41   BBBikeUtil::bbbike_root().'/data/'.$PLZ_BASE_FILE,
42   (map { ("$_/$PLZ_BASE_FILE", "$_/data/$PLZ_BASE_FILE") } @INC),
43   (map { ("$_/berlinco.dat",
44	   "$_/Berlin.data",        "$_/data/Berlin.data") } @INC),
45  ) if !@plzfile;
46$OLD_AGREP = 0 unless defined $OLD_AGREP;
47# on FreeBSD is
48#    ports/textproc/agrep => agrep 2.04 with buggy handling of umlauts
49#    ports/textproc/glimpse => agrep 3.0
50
51# indexes of file fields
52use constant FILE_NAME     => 0;
53use constant FILE_CITYPART => 1;
54use constant FILE_ZIP      => 2; # this is not valid for FMT_NORMAL
55use constant FILE_COORD    => 3; # the "identification" coordinate
56use constant FILE_INDEX    => 4;
57use constant FILE_STRTYPE  => 5; # This is a placeholder, and not implemented now!
58
59use constant FILE_ZIP_FMT_NORMAL => 4; # this is only valid for FMT_NORMAL
60
61$sep = '|';
62
63use constant SA_ANCHOR_LENGTH => 3; # use 0 to turn off String::Approx anchor hack
64use constant SA_ANCHOR_HACK   => "�" x SA_ANCHOR_LENGTH; # use a rare character
65
66sub new {
67    my($class, $file) = @_;
68    my $self = {};
69    if (!defined $file) {
70	foreach (@plzfile) {
71	    if (-r $_ && open(DATA, $_)) {
72		$file = $_;
73		$self->{IsGzip} = 0;
74	    } elsif (-r "$_.gz") {
75		if (is_in_path("gzip") && -d "/tmp" && -w "/tmp") {
76		    require File::Basename;
77		    my $dest = "/tmp/" . File::Basename::basename($_);
78		    system("gzip -dc $_ > $dest");
79		    if (open(DATA, $dest)) {
80			if ($?/256 == 0) {
81			    $file = $dest;
82			    $self->{WasGzip} = 1;
83			}
84		    } else {
85			warn "Cannot open $dest: $!";
86		    }
87		}
88		if (!defined $file) {
89		    warn "Gzip file $_.gz cannot be handled";
90		}
91	    }
92	    last if defined $file;
93	}
94    } elsif (defined $file) {
95	open(DATA, $file) or return undef;
96    } else {
97	return undef;
98    }
99
100    binmode DATA;
101    my($line) = <DATA>;
102    $line =~ s/[\015\012]//g;
103# Automatic detection of format. Caution: this means that the first line
104# in Berlin.coords.data must be complete i.e. having the coords field defined!
105    my(@l) = split(/\|/, $line);
106    if (@l == 3) {
107	$self->{DataFmt}  = FMT_REDUCED;
108	$self->{FieldPLZ} = FILE_ZIP;
109    } elsif (@l == 4) {
110	$self->{DataFmt}  = FMT_COORDS;
111	$self->{FieldPLZ} = FILE_ZIP;
112    } elsif (@l == 5) {
113	$self->{DataFmt}  = FMT_COORDS_WITH_INDEX;
114	$self->{FieldPLZ} = FILE_ZIP;
115    } else {
116	$self->{DataFmt} = FMT_NORMAL;
117	$self->{FieldPLZ} = FILE_ZIP_FMT_NORMAL;
118    }
119    close DATA;
120
121    $self->{File} = $file;
122    $self->{Sep} = '|'; # XXX not yet used
123    bless $self, $class;
124}
125
126# Load the data into $self->{Data}. Not necessary for nearly all other
127# methods.
128sub load {
129    my($self, %args) = @_;
130    my $file = $args{File} || $self->{File};
131    if (do { local $^W = 0; $file ne $self->{Data} }) { # XXX h�h???
132	my @data;
133	open(PLZ, $file)
134	  or die "Die Datei $file kann nicht ge�ffnet werden: $!";
135	binmode PLZ;
136
137	my $code = <<'EOF';
138	while(<PLZ>) {
139	    chomp;
140	    my(@l) = split(/\|/, $_);
141EOF
142	my $push_code;
143	if ($self->{DataFmt} == FMT_REDUCED) {
144	    $push_code = q{push @data,
145			   [@l[FILE_NAME, FILE_CITYPART, FILE_ZIP]]};
146	} elsif ($self->{DataFmt} == FMT_COORDS) {
147	    $push_code = q{push @data,
148			   [@l[FILE_NAME, FILE_CITYPART, FILE_ZIP, FILE_COORD]]};
149	} elsif ($self->{DataFmt} == FMT_COORDS_WITH_INDEX) {
150	    $push_code = q{push @data,
151			   [@l[FILE_NAME, FILE_CITYPART, FILE_ZIP, FILE_COORD, FILE_INDEX]]};
152	} else {
153	    $push_code = q{push @data,
154			   [@l[FILE_NAME, FILE_CITYPART, FILE_ZIP_FMT_NORMAL]]};
155	}
156	$code .= $push_code . <<'EOF';
157	}
158EOF
159        eval $code;
160	close PLZ;
161	$self->{Data} = \@data;
162	$self->{File} = $file;
163	undef $self->{NameHash};
164	undef $self->{PlzHash};
165    }
166}
167
168sub make_plz_re {
169    my($self, $plz) = @_;
170    if ($self->{DataFmt} == FMT_REDUCED ||
171	$self->{DataFmt} == FMT_COORDS ||
172	$self->{DataFmt} == FMT_COORDS_WITH_INDEX) {
173	'^[^|]*|[^|]*|' . $plz;
174    } else {
175	'^[^|]*|[^|]*|[^|]*|[^|]*|' . $plz . '|';
176    }
177}
178
179# indexes of return values
180use constant LOOK_NAME     => 0;
181use constant LOOK_CITYPART => 1;
182use constant LOOK_ZIP      => 2;
183use constant LOOK_COORD    => 3;
184use constant LOOK_INDEX    => 4;
185use constant LOOK_STRTYPE  => 5; # This is a placeholder, and not implemented now!
186
187# XXX make gzip-aware
188# Argumente: (Beschreibung fehlt XXX)
189#  Agrep/GrepType
190#  Noextern
191#  NoStringApprox
192#  Citypart (optionale Einschr�nkung auf einen Bezirk oder Postleitzahl,
193#            may also be an array reference to a number of cityparts)
194#  MultiCitypart - empfehlenswert, wenn Citypart eine Postleitzahl ist!
195#  MultiZIP
196# Ausgabe: Array von Referenzen [strasse, bezirk, plz, "x,y-Koordinate"]
197#  Je nach Format der Quelldatei ($self->{DataFmt}) fehlt die x,y-Koordinate
198sub look {
199    my($self, $str, %args) = @_;
200
201    my $file = $args{File} || $self->{File};
202    my %valid_cityparts;
203    if (defined $args{Citypart} && length $args{Citypart}) {
204	%valid_cityparts = map { (lc $_,1) } ref $args{Citypart} eq 'ARRAY' ? @{ $args{Citypart} } : $args{Citypart};
205    }
206
207    my @res;
208
209    # Windows usually does not have grep and agrep externally
210    if ($^O eq 'MSWin32' && !exists $args{Noextern}) {
211	$args{Noextern} = 1;
212    }
213
214    print STDERR "->look($str, " . join(" ", %args) .") in '$file'\n" if $VERBOSE;
215
216    #XXX use fgrep instead of grep? slightly faster, no quoting needed!
217    my $grep_type = ($args{Agrep} ? 'agrep' : ($args{GrepType} || 'grep'));
218    my @push_inx;
219    if      ($self->{DataFmt} == FMT_NORMAL) {
220	@push_inx = (FILE_NAME, FILE_CITYPART, $self->{FieldPLZ});
221    } elsif ($self->{DataFmt} == FMT_REDUCED) {
222	@push_inx = (FILE_NAME, FILE_CITYPART, FILE_ZIP);
223    } elsif ($self->{DataFmt} == FMT_COORDS_WITH_INDEX) {
224	@push_inx = (FILE_NAME, FILE_CITYPART, FILE_ZIP, FILE_COORD, FILE_INDEX);
225    } else {
226	@push_inx = (FILE_NAME, FILE_CITYPART, FILE_ZIP, FILE_COORD);
227    }
228    if ($grep_type eq 'agrep') {
229	if ($OLD_AGREP ||
230	    (!$args{Noextern} && !is_in_path('agrep')) ||
231	    length($str) > AGREP_LONGEST_RX # otherwise there are "pattern too long" errors
232	    # XXX AGREP_LONGEST_RX is not perfect --- the string is rx-escaped, see below
233	   ) {
234	    $args{Noextern} = 1;
235	}
236	if ($args{Noextern}) {
237	    eval q{local $SIG{'__DIE__'};
238		   die "Won't use String::Approx" if $args{NoStringApprox};
239		   require String::Approx;
240		   String::Approx->VERSION(2.7);
241	       };
242	    if ($@) {
243		if ($args{Agrep} == 1) {
244		    $grep_type = 'grep-umlaut';
245		} else {
246		    $grep_type = 'grep';
247		}
248	    }
249	}
250    }
251    if ($grep_type eq 'grep') {
252	if (!$args{Noextern} && !is_in_path('grep')) {
253	    $args{Noextern} = 1;
254	}
255    }
256
257    my %res;
258    my $push_sub = sub {
259	my(@to_push) = (split(/\|/, $_[FILE_NAME]))[@push_inx];
260	if (($args{MultiCitypart}||
261	     $to_push[FILE_CITYPART] eq "" ||
262	     !exists $res{$to_push[FILE_NAME]}->{$to_push[FILE_CITYPART]}) &&
263	    ($args{MultiZIP}     ||
264	     $to_push[FILE_ZIP] eq "" ||
265	     !exists $res{$to_push[FILE_NAME]}->{$to_push[FILE_ZIP]})
266	   ) {
267	    # filter by citypart (Bezirk) or ZIP
268	    return if (keys %valid_cityparts &&
269		       !($valid_cityparts{lc $to_push[FILE_CITYPART]} ||
270			 $valid_cityparts{$to_push[FILE_ZIP]})
271		      );
272
273	    push @res, [@to_push];
274	    return if defined $args{Max} and $args{Max} < $#res;
275	    $res{$to_push[FILE_NAME]}->{$to_push[FILE_CITYPART]}++;
276	    $res{$to_push[FILE_NAME]}->{$to_push[FILE_ZIP]}++;
277	}
278    };
279
280    if (!$args{Noextern} && $grep_type =~ /^a?grep$/) {
281	unless ($args{Noquote}) {
282	    if ($grep_type eq 'grep') {
283		# XXX quotemeta verwenden?
284		$str =~ s/([\\.*\[\]])/\\$1/g; # quote metacharacters
285	    } else { # agrep
286		$str =~ s/([\$\^\*\[\]\^\|\(\)\!\`\,\;])/\\$1/g;
287	    }
288	    $str = "^$str";
289	}
290
291	# limitation of agrep:
292	if ($grep_type eq 'agrep' && length($str) > AGREP_LONGEST_RX) {
293	    $str = substr($str, 0, AGREP_LONGEST_RX);
294	    $str =~ s/\\$//; # remove a (lonely?) backslash at the end
295	    # XXX but this will be wrong if it's really a \\
296	}
297
298	if (eval { require Encode; Encode::is_utf8($str) }) {
299	    $str = Encode::encode("iso-8859-1", $str);
300	}
301	my(@grep_args) = ('-i', $str, $file);
302	if ($grep_type eq 'agrep' && $args{Agrep}) {
303	    unshift @grep_args, "-$args{Agrep}";
304	}
305	my @cmd = ($grep_type, @grep_args);
306	warn "About to call <@cmd>" if $VERBOSE;
307	CORE::open(PLZ, "-|") or do {
308	    $ENV{LANG} = $ENV{LC_ALL} = $ENV{LC_CTYPE} = 'C';
309	    # agrep emits some warnings "using working-directory '...'
310	    # to locate dictionaries" if it does not have a $ENV{HOME}
311	    # (which is probably a bug, because dictionaries are not
312	    # used at all)
313	    $ENV{HOME} = "/something";
314	    exec @cmd;
315	    warn "While doing @cmd: $!";
316	    require POSIX;
317	    POSIX::_exit(1); # avoid running any END blocks
318	};
319	my %res;
320	binmode PLZ;
321	while(<PLZ>) {
322	    chomp;
323	    $push_sub->($_);
324	}
325	close PLZ;
326    } else {
327	CORE::open(PLZ, $file)
328	  or die "Die Datei $file kann nicht ge�ffnet werden: $!";
329	binmode PLZ;
330	if ($grep_type eq 'agrep') {
331	    chomp(my @data = <PLZ>);
332	    close PLZ;
333	    my %res;
334	    if (@data) {
335		foreach (map { substr $_, SA_ANCHOR_LENGTH }
336			 String::Approx::amatch(SA_ANCHOR_HACK . $str,
337						['i', $args{Agrep}],
338						map { SA_ANCHOR_HACK . $_ } @data)) {
339		    $push_sub->($_);
340		}
341	    }
342	} elsif ($grep_type =~ m{^grep-(umlaut|inword|substr)$}) {
343	    my $sub_type = $1;
344	    if ($sub_type eq 'umlaut') {
345		$str = '(?i:^' . quotemeta(BBBikeUtil::umlauts_to_german($str)) . ')';
346	    } elsif ($sub_type eq 'inword') {
347		$str = '(?i:\b' . quotemeta(BBBikeUtil::umlauts_to_german($str)) . '\b)';
348	    } elsif ($sub_type eq 'substr') {
349		$str = '(?i:' . quotemeta(BBBikeUtil::umlauts_to_german($str)) . ')';
350	    }
351	    $str = qr{$str};
352	    while(<PLZ>) {
353		chomp;
354		if (BBBikeUtil::umlauts_to_german($_) =~ $str) {
355		    $push_sub->($_);
356		}
357	    }
358	    close PLZ;
359	} else {
360	    $str = quotemeta($str) unless $args{Noquote};
361	    $str = "^$str" unless $args{Noquote};
362#XXX del?	    $str =~ s/\|/\\|/g;
363	    $str = '(?i:' . $str . ')';
364	    $str = qr{$str};
365	    my %res;
366	    while(<PLZ>) {
367		chomp;
368		if ($_ =~ $str) {
369		    $push_sub->($_);
370		}
371	    }
372	    close PLZ;
373	}
374    }
375
376    @res;
377}
378
379# Argument: an array of references (the output of look())
380# Combine records which form the same street (though same identification coordinate)
381# Returned value has the same format as the input
382#
383# Historical note: before 2010-07 this function did also guesses by
384# checking same citypart and/or same zip code. Unfortunately there are
385# actually two pairs of same-named streets in Berlin (Schoenhauser
386# Str. and Waldstr.) which have the same zip code though being
387# different. Previously Berlin.coords.data did not use the coordinate
388# as an id.
389sub combine {
390    my($self, @in) = @_;
391    my %out;
392    my @copy_indexes = (LOOK_NAME, LOOK_COORD);
393    if ($self->{DataFmt} eq FMT_COORDS_WITH_INDEX) {
394	push @copy_indexes, LOOK_INDEX;
395    }
396 CHECK_IT:
397    foreach my $s (@in) {
398	if (exists $out{$s->[LOOK_NAME]}) {
399	    foreach my $r (@{ $out{$s->[LOOK_NAME]} }) {
400		my $eq_coord = $s->[LOOK_COORD] && $s->[LOOK_COORD] eq $r->[LOOK_COORD];
401		if ($eq_coord) {
402		    my $eq_cp = grep { $s->[LOOK_CITYPART] eq $_ } grep { $_ ne "" } @{ $r->[LOOK_CITYPART] };
403		    my $eq_zp = grep { $s->[LOOK_ZIP]      eq $_ } grep { $_ ne "" } @{ $r->[LOOK_ZIP] };
404		    push @{ $r->[LOOK_CITYPART] }, $s->[LOOK_CITYPART]
405			unless $eq_cp;
406		    push @{ $r->[LOOK_ZIP] }, $s->[LOOK_ZIP]
407			unless $eq_zp;
408		    next CHECK_IT;
409		}
410	    }
411	}
412	# does not exist or is a new citypart/zip combination
413	my $r = [];
414	$r->[$_] = $s->[$_] for (@copy_indexes);
415	$r->[LOOK_CITYPART] = [ $s->[LOOK_CITYPART] ];
416	$r->[LOOK_ZIP] = [ $s->[LOOK_ZIP ] ];
417	push @{ $out{$s->[LOOK_NAME]} }, $r;
418    }
419    map { @$_ } values %out;
420}
421
422# converts an array element from combine from
423#    ["Hauptstr.", ["Friedenau","Schoeneberg],[10827,12159], $coord]
424# to
425#    ["Hauptstr.", "Friedenau, Schoeneberg", "10827,12159", $coord]
426sub combined_elem_to_string_form {
427    my($self, $elem) = @_;
428    my @copy_indexes = (LOOK_NAME, LOOK_COORD);
429    if ($self->{DataFmt} eq FMT_COORDS_WITH_INDEX) {
430	push @copy_indexes, LOOK_INDEX;
431    }
432    my $r = [];
433    $r->[$_] = $elem->[$_] for (@copy_indexes);
434    $r->[LOOK_CITYPART] = join(", ", @{$elem->[LOOK_CITYPART]});
435    $r->[LOOK_ZIP]      = join(", ", @{$elem->[LOOK_ZIP]});
436    $r;
437}
438
439# Split a street specification like "Heerstr. (Charlottenburg, Spandau)"
440# to the street component and the citypart components
441sub split_street {
442    my $street = shift;
443    my @cityparts;
444    ($street, @cityparts) = Strasse::split_street_citypart($street);
445    if (@cityparts) {
446	($street, Citypart => \@cityparts);
447    } else {
448	($street);
449    }
450}
451
452# Match-Reihenfolge:
453# * nicht modifiziert ohne Agrep
454# * "strasse" nach "str." umgewandelt ohne Agrep
455# * inkrementell bis $args{Agrep} mit Agrep abwechselnd nicht modifiziert und
456#   mit s/strasse/str./
457# Argumente in %args:
458#   Agrep: 0, wenn grep verwendet werden soll
459#          >0, wenn mit Fehlern gesucht werden, dann gibt der Wert die
460#              maximale Anzahl der erlaubten Fehler an
461#          'default', wenn der Standardwert von 3 Fehlern genommen werden soll
462#                     Bei l�ngeren W�rtern wird die Maximalanzahl bis 5 erh�ht.
463# Sonstige Argumente werden nach look() durchgereicht.
464# Ausgabe:
465#   erstes Element: siehe look() (als Arrayreferenz)
466#   zweites Element: Anzahl der Fehler f�r das Ergebnis
467# Wenn $args{LookCompat} gesetzt ist, dann ist die Ausgabe genau wie bei
468# look().
469sub look_loop {
470    my($self, $str, %args) = @_;
471    my $max_agrep;
472    if (defined $args{Agrep} && $args{Agrep} eq 'default') {
473	$max_agrep = 3;
474	# Allow more errors for longer strings:
475	if    (length($str) > 15) { $max_agrep = 4 }
476	elsif (length($str) > 25) { $max_agrep = 5 }
477	delete $args{Agrep};
478    } else {
479	$max_agrep = delete $args{Agrep} || 0;
480    }
481
482    my $agrep = 0;
483    my @matchref;
484    # 1. Try unaltered
485    @matchref = $self->look($str, %args);
486    if (!@matchref) {
487	# 2. Try to strip house number
488	if (my $str0 = _strip_hnr($str)) {
489	    @matchref = $self->look($str0, %args);
490	}
491	# 3. Try to strip "stra�e" => "str."
492	# 3b. Strip house number
493	if (!@matchref) {
494	    if (my $str0 = _strip_strasse($str)) {
495		@matchref = $self->look($str0, %args);
496		if (!@matchref) {
497		    if ($str0 = _strip_hnr($str0)) {
498			@matchref = $self->look($str0, %args);
499		    }
500		}
501	    }
502	}
503	# 4. Try to expand "Str." on beginning of the string
504	# 4b. Strip house number
505	if (!@matchref) {
506	    if (my $str0 = _expand_strasse($str)) {
507		@matchref = $self->look($str0, %args);
508		if (!@matchref) {
509		    if ($str0 = _strip_hnr($str0)) {
510			@matchref = $self->look($str0, %args);
511		    }
512		}
513	    }
514	}
515	# 5. Try word match in the middle of the string
516	if (!@matchref && length $str >= 4) {
517	    my %args = %args;
518	    delete $args{Agrep};
519	    $args{GrepType} = "grep-inword";
520	    @matchref = $self->look($str, %args);
521	}
522	# 6. Use increasing approximate match. Try first unaltered, then
523	#    with stripped street, then without house number.
524	if (!@matchref) {
525	    $agrep = 1;
526	    while ($agrep <= $max_agrep) {
527		@matchref = $self->look($str, %args, Agrep => $agrep);
528		if (!@matchref && (my $str0 = _strip_strasse($str))) {
529		    @matchref = $self->look($str0, %args, Agrep => $agrep);
530		}
531		if (!@matchref && (my $str0 = _strip_hnr($str))) {
532		    @matchref = $self->look($str0, %args, Agrep => $agrep);
533		}
534		{
535		    my $str0;
536		    if (!@matchref
537			&& ($str0 = _strip_strasse($str))) {
538			@matchref = $self->look($str0, %args, Agrep => $agrep);
539			if (!@matchref
540			    && ($str0 = _strip_hnr($str0))) {
541			    @matchref = $self->look($str0, %args, Agrep => $agrep);
542			}
543		    }
544		}
545		{
546		    my $str0;
547		    if (!@matchref
548			&& ($str0 = _expand_strasse($str))) {
549			@matchref = $self->look($str0, %args, Agrep => $agrep);
550			if (!@matchref
551			    && ($str0 = _strip_hnr($str0))) {
552			    @matchref = $self->look($str0, %args, Agrep => $agrep);
553			}
554		    }
555		}
556		last if @matchref;
557		$agrep++;
558	    }
559	}
560    }
561    if ($args{LookCompat}) {
562	@matchref;
563    } else {
564	(\@matchref, $agrep);
565    }
566}
567
568sub _strip_strasse {
569    my $str = shift;
570    if ($str =~ /stra(?:ss|�)e/i) {
571	$str =~ s/(s)tra(?:ss|�)e/$1tr./i;
572	$str;
573    } else {
574	undef;
575    }
576}
577
578sub _strip_hnr {
579    my $str = shift;
580    # This strips input like "Stra�e 1a" or "Stra�e 1-2". Maybe
581    # also strip "Stra�e 1 a"? XXX
582    if ($str =~ m{\s+(?:\d+[a-z]?|\d+\s*[-/]\s*\d+)\s*$}) {
583	$str =~ s{\s+(?:\d+[a-z]?|\d+\s*[-/]\s*\d+)\s*$}{};
584	$str;
585    } else {
586	undef;
587    }
588}
589
590sub _expand_strasse {
591    my $str = shift;
592    my $replaced = 0;
593    if      ($str =~ s/^(U\+S|S\+U)[- ](?:Bahnhof|Bhf\.?)\s+/S-Bhf /i) { # Choose one
594	$replaced++;
595    } elsif ($str =~ s/^(U\+S|S\+U)\s+/S-Bhf /i) { # Choose one
596	$replaced++;
597    } elsif ($str =~ s/^([US])[- ](?:Bahnhof|Bhf\.?)\s+/uc($1)."-Bhf "/ie) {
598	$replaced++;
599    } elsif ($str =~ s/^([US])Bhf\.?\s+/uc($1)."-Bhf "/ie) { # without space or dash...
600	$replaced++;
601    } elsif ($str =~ s/^([US])\s+/uc($1)."-Bhf "/ie) {
602	$replaced++;
603    }
604    if      ($str =~ s/^(k)l\.?\s+(.*str)/$1leine $2/i) {
605	$replaced++;
606    } elsif ($str =~ s/^(g)r\.?\s+(.*str)/$1ro�e $2/i) {
607	$replaced++;
608    }
609    if ($str =~ /^\s*str\.(\S)?/i) {
610	if (defined $1) {	# add space
611	    $str =~ s/^\s*(s)tr\./$1tra�e /i;
612	} else {
613	    $str =~ s/^\s*(s)tr\./$1tra�e/i;
614	}
615	$replaced++;
616	$str;
617    } elsif ($str =~ s/^\s*(s)trasse/$1tra�e/i) {
618	$replaced++;
619	$str;
620    } elsif ($replaced) {
621	$str;
622    } else {
623	undef;
624    }
625}
626
627# Sortiert die Stra�en eines look_loop-Ergebnisses.
628# Argumente und R�ckgabewerte sind vom gleichen Format wie bei look_loop.
629sub look_loop_best {
630    my($self, $str, %args) = @_;
631    my $look_compat = delete $args{LookCompat};
632    my($matchref, $agrep) = $self->look_loop($str, %args);
633    if (@$matchref) {
634	my @rating;
635	my $str_rx = qr{(?i:^\Q$str\E)};
636	for(my $i=0; $i<=$#$matchref; $i++) {
637	    my $item = $matchref->[$i];
638	    if ($item->[LOOK_NAME] eq $str) {
639		push @rating, [ 100, $item ];
640	    } elsif ($item->[LOOK_NAME] =~ $str_rx) {
641		push @rating, [ 40 + 40-length($item->[LOOK_NAME]), $item ];
642	    } else {
643		push @rating, [ 40-length($item->[LOOK_NAME]), $item ];
644	    }
645	}
646	$matchref = [map  { $_->[1] } sort { $b->[0] <=> $a->[0] } @rating];
647    }
648    if ($look_compat) {
649	@$matchref;
650    } else {
651	($matchref, $agrep);
652    }
653}
654
655# In: an array of indexes FILE_...
656# Out: a hashref $hash->{VAL_INDEX_1}{VAL_INDEX_2}{...} = [$pos1, $pos2, ...]
657sub make_any_hash {
658    my($self, @indexes) = @_;
659    die "Please call the load() method first" if !$self->{Data};
660    my %hash;
661    my $i = 0;
662    foreach my $datarec (@{$self->{Data}}) {
663	my $h = \%hash;
664	for(my $index_i = 0; $index_i <= $#indexes; $index_i++) {
665	    my $field_val = $datarec->[$indexes[$index_i]];
666	    if ($index_i == $#indexes) {
667		push @{$h->{$field_val}}, $i;
668	    } else {
669		$h = $h->{$field_val} ||= {};
670	    }
671	}
672	$i++;
673    }
674    \%hash;
675}
676
677sub as_streets {
678    my $self = shift;
679    my(%args) = @_;
680    my $cat = $args{Cat} || 'X';
681
682    my @data;
683
684    if ($self->{DataFmt} ne FMT_COORDS) {
685	die "Only PLZ format FMT_COORDS (".FMT_COORDS.") is supported, not " . $self->{DataFmt};
686    }
687    CORE::open(F, $self->{File}) or die "Can't open $self->{File}: $!";
688    binmode F;
689    while(<F>) {
690	chomp;
691	my(@f) = split /\|/;
692	push @data, $f[FILE_NAME]." (".$f[FILE_CITYPART].", ".$f[FILE_ZIP].")\t$cat ".$f[FILE_COORD]."\n"
693	    if defined $f[FILE_COORD] && $f[FILE_COORD] ne '';
694    }
695    close F;
696
697    require Strassen;
698    my $s = Strassen->new_from_data_ref(\@data);
699    $s->{File} = $self->{File};
700    $s;
701}
702
703# convert Strassen.pm object to PLZ.pm data file
704# my $new_data = PLZ->new_data_from_streets(new Strassen ...);
705sub new_data_from_streets {
706    my($class, $s) = @_;
707    my $ret = "";
708    $s->init;
709    while(1) {
710	my $r = $s->next;
711	last if !@{ $r->[Strassen::COORDS()] };
712	my($street, %args) = split_street($r->[Strassen::NAME()]);
713	$ret .= "$street$sep";
714	if ($args{Citypart}) {
715	    $ret .= join(", ", @{ $args{Citypart} });
716	}
717	$ret .= "$sep$sep";
718	$ret .= $r->[Strassen::COORDS()][$#{$r->[Strassen::COORDS()]}/2];
719	$ret .= "\n";
720    }
721    $ret;
722}
723
724sub zip_to_cityparts_hash {
725    my($self, %args) = @_;
726    my $cachebase;
727    my $h;
728    if ($args{UseCache}) {
729	require Strassen::Util;
730	require File::Basename;
731	$cachebase = "zip_to_cityparts_" . File::Basename::basename($self->{File});
732	$h = Strassen::Util::get_from_cache($cachebase, [$self->{File}]);
733	if ($h) {
734	    warn "Using cache for $cachebase\n" if $VERBOSE;
735	    return $h;
736	}
737    }
738
739    my $hh;
740    open(PLZ, $self->{File})
741	or die "Die Datei $self->{File} kann nicht ge�ffnet werden: $!";
742    binmode PLZ;
743    while(<PLZ>) {
744	chomp;
745	my(@l) = split(/\|/, $_);
746	if ($l[FILE_ZIP] ne "" && $l[FILE_CITYPART] ne "") {
747	    $hh->{$l[FILE_ZIP]}{$l[FILE_CITYPART]}++;
748	}
749    }
750    close PLZ;
751
752    while(my($k,$v) = each %$hh) {
753	$h->{$k} = [keys %$v];
754    }
755
756    if (defined $cachebase) {
757	Strassen::Util::write_cache($h, $cachebase);
758	warn "Wrote cache ($cachebase)\n" if $VERBOSE;
759    }
760    $h;
761}
762
763sub norm_street {
764    my $str = shift;
765    $str =~ s/(s)tra(?:ss|�)e$/$1tr\./i; # XXX more?
766    $str =~ s/^\s+//;
767    $str =~ s/\s+$//;
768    $str =~ s/ +/ /g;
769    $str;
770}
771
772sub streets_hash {
773    my $self = shift;
774    my %hash;
775    open(D, $self->{File}) or die "Can't open $self->{File}: $!";
776    binmode D;
777    my $pos = tell(D);
778    while(<D>) {
779	chomp;
780	/^(.+?)\|/;
781	my $l = $1;
782	if (!exists $hash{$l}) {
783	    $hash{$l} = $pos;
784	}
785	$pos = tell(D);
786    }
787    close D;
788    \%hash;
789}
790
791sub street_words_hash {
792    my $self = shift;
793    my %hash;
794    open(D, $self->{File}) or die "Can't open $self->{File}: $!";
795    binmode D;
796    my $pos = tell(D);
797    while(<D>) {
798	chomp;
799	/^(.+?)\|/;
800	my @s = split /\s+/, $1;
801	my $h = \%hash;
802	for my $i (0 .. $#s) {
803	    if (!exists $h->{$s[$i]}) {
804		if ($i == $#s) {
805		    $h->{$s[$i]} = $pos;
806		} else {
807		    $h->{$s[$i]} = {};
808		    $h = $h->{$s[$i]};
809		}
810	    } else {
811		my $old_h = $h->{$s[$i]};
812		if (!UNIVERSAL::isa($old_h, 'HASH')) {
813		    $h->{$s[$i]} = {"" => $old_h};
814		    $old_h = $h->{$s[$i]};
815		}
816		if ($i == $#s) {
817		    if (!exists $old_h->{""}) {
818			$old_h->{""} = $pos;
819		    }
820		} else {
821		    $h = $h->{$s[$i]};
822		}
823	    }
824	}
825	$pos = tell(D);
826    }
827    close D;
828    \%hash;
829}
830
831# Arguments:
832#   $text: string to examine
833#   $h: result of street_words_hash
834# XXX still a simple-minded solution
835sub find_streets_in_text {
836    my($self, $text, $h) = @_;
837    $h = $self->{StreetWordsHash} if !$h;
838    my @res;
839    my @s = split /(\s+)/, $text;
840    my $begin = 0;
841    my $length;
842    for(my $i = 0; $i <= $#s; $i+=2) {
843	$length = length($s[$i]);
844	if ($s[$i] =~ /^(s)tra(?:ss|�)e$/i) {
845	    $s[$i] = "$1tr.";
846	}
847	my $ii = 0;
848	if (exists $h->{$s[$i]}) {
849	    my $s = $s[$i];
850	    my $hh = $h->{$s[$i]};
851	    while (1) {
852		if (!UNIVERSAL::isa($hh, 'HASH')) {
853		    push @res, [$s, $begin, $length];
854		    last;
855		}
856		if (!exists $hh->{$s[$i+$ii+2]}) {
857		    if (exists $hh->{""}) {
858			push @res, [$s, $begin, $length];
859		    }
860		    last;
861		}
862		$ii+=2;
863		$s .= " $s[$i+$ii]";
864		$length += length($s[$i+$ii-1]) + length($s[$i+$ii]);
865		$hh = $hh->{$s[$i+$ii]};
866	    }
867	}
868
869	$i += $ii;
870	$begin += $length;
871	if (defined $s[$i+1]) {
872	    $begin += length($s[$i+1]);
873	}
874    }
875    \@res;
876}
877
878sub get_street_type {
879    my($self, $look_result) = @_;
880    if (defined $look_result->[LOOK_STRTYPE]) { # This is not yet defined, but maybe some day?
881	$look_result->[LOOK_STRTYPE];
882    } else {
883	my $name = $look_result->[LOOK_NAME];
884	if      ($name =~ m{(^Kolonie\s
885			    |^KGA\s
886			    |\s\(Kolonie\)$
887			    )}x) {
888	    return 'orchard';
889	} elsif ($name =~ m{^[SU]-Bhf\.?\s}) {
890	    return 'railway station';
891	} elsif ($name =~ m{\s\(Park\)$}) {
892	    return 'park';
893	} elsif ($name =~ m{\s\(Gastst�tte\)$}) {
894	    return 'restaurant';
895	} elsif ($name =~ m{\s\(Siedlung\)$}) {
896	    return 'settlement'; # XXX English wording?
897	} elsif ($name =~ m{(?:^Insel\s|\s\(Insel\)$)}) {
898	    return 'island';
899	} elsif ($name =~ m{\s\(geplant\)$}) {
900	    return 'projected street';
901	} else {
902	    return 'street';
903	}
904    }
905}
906
907# This method may be removed or renamed one day!
908sub _populate_street_type {
909    my($self, $look_result) = @_;
910    my $type = $self->get_street_type($look_result);
911    $look_result->[LOOK_STRTYPE] = $type;
912}
913
914return 1 if caller();
915
916######################################################################
917#
918# standalone program
919#
920package main;
921require Getopt::Long;
922
923my $agrep = "default";
924my $extern = 1;
925my $citypart;
926my $multi_citypart = 0;
927my $multi_zip = 0;
928my $grep_type;
929
930if (!Getopt::Long::GetOptions
931    ("agrep=i" => \$agrep,
932     "extern!" => \$extern,
933     "citypart=s" => \$citypart,
934     "multicitypart!" => \$multi_citypart,
935     "multizip!" => \$multi_zip,
936     "greptype=s" => \$grep_type,
937     "v!" => \$PLZ::VERBOSE,
938     )
939   ) {
940    die "Usage: $0 [-v] [-agrep errors] [-greptype grep-inword|grep-umlaut|...]
941	      [-extern] [-citypart citypart]
942              [-multicitypart] [-multizip] street
943";
944}
945
946my $street = shift || die "Street?";
947
948my $plz = PLZ->new;
949
950my @args;
951push @args, "Agrep", $agrep;
952if ($grep_type) {
953    push @args, "GrepType", $grep_type;
954}
955if (!$extern) {
956    push @args, "Noextern", 1;
957}
958if (defined $citypart and $citypart ne "") {
959    push @args, "Citypart", $citypart;
960}
961if ($multi_citypart) {
962    push @args, "MultiCitypart", 1;
963}
964if ($multi_zip) {
965    push @args, "MultiZIP", 1;
966}
967
968my($res_ref, $errors) = $plz->look_loop(PLZ::split_street($street), @args);
969foreach my $res (@$res_ref) {
970    printf "%-40s %-20s %-10s (%s)\n", @$res;
971}
972print "*** Errors: $errors\n";
973
974######################################################################
975# Ein Kuriosum in Berlin: sowohl die Waldstr. in Gr�nau als auch die
976# Waldstr. in Schm�ckwitz haben die gleiche PLZ 12527. Erschwerend kommt
977# hinzu, dass Gr�nau (fr�her K�penick) und Schm�ckwitz (fr�her Treptow)
978# heute im gleichen Bezirk liegen. Siehe auch combine() f�r die derzeitige
979# L�sung des Problems.
980
981# Weiterer Fall: es gibt zweimal den Mittelweg, PLZ 12524, aber in
982# unterschiedlichen Stadtteilen im gleichen Bezirk: Altglienicke und
983# Bohnsdorf
984
985# Quick check:
986# perl -Ilib -MData::Dumper -MPLZ -e '$p=PLZ->new;warn Dumper $p->look_loop($ARGV[0], Max => 1, MultiZIP => 1, MultiCitypart => 1, Agrep => "default")' ...
987#
988# Convert to bbd:
989# perl -F'\|' -nale 'print "@F[0,1,2]\tX $F[3]" if $F[3]' Berlin.coords.data > /tmp/plz.bbd
990
991