1# soundex.tcl --
2#
3#	Implementation of soundex in Tcl
4#
5# Copyright (c) 2003 by Andreas Kupries <andreas_kupries@users.sourceforge.net>
6#
7# See the file "license.terms" for information on usage and redistribution
8# of this file, and for a DISCLAIMER OF ALL WARRANTIES.
9#
10# RCS: @(#) $Id: soundex.tcl,v 1.3 2004/01/15 06:36:14 andreas_kupries Exp $
11
12package require Tcl 8.2
13
14namespace eval ::soundex {}
15
16## ------------------------------------------------------------
17##
18## I. Soundex by Knuth.
19
20# This implementation of the Soundex algorithm is released to the public
21# domain: anyone may use it for any purpose.  See if I care.
22
23# N. Dean Pentcheff 1/13/89 Dept. of Zoology University of California Berkeley,
24#    CA  94720 dean@violet.berkeley.edu
25# TCL port by Evan Rempel 2/10/98 Dept Comp Services University of Victoria.
26# erempel@uvic.ca
27
28# proc ::soundex::knuth ( string )
29#
30#   Given as argument: a character string. Returns: a static string, 4 characters long
31#   This string is the Soundex key for the argument string.
32#   Side effects and limitations:
33#   Does not clobber the string passed in as the argument. No limit on
34#   argument string length. Assumes a character set with continuously
35#   ascending and contiguous letters within each case and within the digits
36#   (e.g. this works for ASCII and bombs in EBCDIC. But then, most things
37#   do.). Reference: Adapted from Knuth, D.E. (1973) The art of computer
38#   programming; Volume 3: Sorting and searching.  Addison-Wesley Publishing
39#   Company: Reading, Mass. Page 392.
40#   Special cases: Leading or embedded spaces, numerals, or punctuation are squeezed
41#   out before encoding begins.
42#
43#   Null strings or those with no encodable letters return the code 'Z000'.
44#
45#   Test data from Knuth (1973):
46#   Euler   Gauss   Hilbert Knuth   Lloyd   Lukasiewicz
47#   E460    G200    H416    K530    L300    L222
48
49namespace eval ::soundex {
50    variable  soundexKnuthCode
51    array set soundexKnuthCode {
52	a 0 b 1 c 2 d 3 e 0 f 1 g 2 h 0 i 0 j 2 k 2 l 4 m 5
53	n 5 o 0 p 1 q 2 r 6 s 2 t 3 u 0 v 1 w 0 x 2 y 0 z 2
54    }
55}
56proc ::soundex::knuth {in} {
57    variable soundexKnuthCode
58    set key ""
59
60    # Remove the leading/trailing white space punctuation etc.
61
62    set TempIn [string trim $in "\t\n\r .,'-"]
63
64    # Only use alphabetic characters, so strip out all others
65    # also, soundex index uses only lower case chars, so force to lower
66
67    regsub -all {[^a-z]} [string tolower $TempIn] {} TempIn
68    if {[string length $TempIn] == 0} {
69	return Z000
70    }
71    set last [string index $TempIn 0]
72    set key  [string toupper $last]
73    set last $soundexKnuthCode($last)
74
75    # Scan rest of string, stop at end of string or when the key is
76    # full
77
78    set count    1
79    set MaxIndex [string length $TempIn]
80
81    for {set index 1} {(($count < 4) && ($index < $MaxIndex))} {incr index } {
82	set chcode $soundexKnuthCode([string index $TempIn $index])
83	# Fold together adjacent letters sharing the same code
84	if {![string equal $last $chcode]} {
85	    set last $chcode
86	    # Ignore code==0 letters except as separators
87	    if {$last != 0} then {
88		set key $key$last
89		incr count
90	    }
91	}
92    }
93    return [string range ${key}0000 0 3]
94}
95
96package provide soundex 1.0
97