1module Utils.Match exposing (consecutiveChars, jaroWinkler)
2
3import Char
4import Utils.List exposing (zip)
5
6
7{-|
8
9    Adapted from https://blog.art-of-coding.eu/comparing-strings-with-metrics-in-haskell/
10
11-}
12jaro : String -> String -> Float
13jaro s1 s2 =
14    if s1 == s2 then
15        1.0
16
17    else
18        let
19            l1 =
20                String.length s1
21
22            l2 =
23                String.length s2
24
25            z2 =
26                zip (List.range 1 l2) (String.toList s2)
27                    |> List.map (Tuple.mapSecond Char.toCode)
28
29            searchLength =
30                -- A character must be within searchLength spaces of the
31                -- character we are matching against in order to be considered
32                -- a match.
33                -- (//) is integer division, which removes the need to floor
34                -- the result.
35                (max l1 l2 // 2) - 1
36
37            m =
38                zip (List.range 1 l1) (String.toList s1)
39                    |> List.map (Tuple.mapSecond Char.toCode)
40                    |> List.concatMap (charMatch searchLength z2)
41
42            ml =
43                List.length m
44
45            t =
46                m
47                    |> List.map (transposition z2 >> toFloat >> (*) 0.5)
48                    |> List.sum
49
50            ml1 =
51                toFloat ml / toFloat l1
52
53            ml2 =
54                toFloat ml / toFloat l2
55
56            mtm =
57                (toFloat ml - t) / toFloat ml
58        in
59        if ml == 0 then
60            0
61
62        else
63            (1 / 3) * (ml1 + ml2 + mtm)
64
65
66winkler : String -> String -> Float -> Float
67winkler s1 s2 jaro_ =
68    if s1 == "" || s2 == "" then
69        0.0
70
71    else if s1 == s2 then
72        1.0
73
74    else
75        let
76            l =
77                consecutiveChars s1 s2
78                    |> String.length
79                    |> toFloat
80
81            p =
82                0.25
83        in
84        jaro_ + ((l * p) * (1.0 - jaro_))
85
86
87jaroWinkler : String -> String -> Float
88jaroWinkler s1 s2 =
89    if s1 == "" || s2 == "" then
90        0.0
91
92    else if s1 == s2 then
93        1.0
94
95    else
96        jaro s1 s2
97            |> winkler s1 s2
98
99
100consecutiveChars : String -> String -> String
101consecutiveChars s1 s2 =
102    if s1 == "" || s2 == "" then
103        ""
104
105    else if s1 == s2 then
106        s1
107
108    else
109        cp (String.toList s1) (String.toList s2) []
110            |> String.fromList
111
112
113cp : List Char -> List Char -> List Char -> List Char
114cp l1 l2 acc =
115    case ( l1, l2 ) of
116        ( x :: xs, y :: ys ) ->
117            if x == y then
118                cp xs ys (acc ++ [ x ])
119
120            else if List.length acc > 0 then
121                -- If we have already found matches, we bail. We only want
122                -- consecutive matches.
123                acc
124
125            else
126                -- Go through every character in l1 until it matches the first
127                -- character in l2, and then start counting from there.
128                cp l1 ys acc
129
130        _ ->
131            acc
132
133
134charMatch : Int -> List ( Int, Int ) -> ( Int, Int ) -> List ( Int, Int )
135charMatch matchRange list ( p, q ) =
136    list
137        |> List.drop (p - matchRange - 1)
138        |> List.take (p + matchRange)
139        |> List.filter (Tuple.second >> (==) q)
140
141
142transposition : List ( Int, Int ) -> ( Int, Int ) -> Int
143transposition list ( p, q ) =
144    list
145        |> List.filter
146            (\( x, y ) ->
147                p /= x && q == y
148            )
149        |> List.length
150