1<?php
2/**
3 * Zend Framework (http://framework.zend.com/)
4 *
5 * @link      http://github.com/zendframework/zf2 for the canonical source repository
6 * @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
7 * @license   http://framework.zend.com/license/new-bsd New BSD License
8 * @package   Zend_Search
9 */
10
11namespace ZendSearch\Lucene\Search\Similarity;
12
/**
 * @todo This class is effectively used as a singleton (through its static
 *       default instance) and has to be redesigned.
 */
16
17/**
18 * @category   Zend
19 * @package    Zend_Search_Lucene
20 * @subpackage Search
21 */
abstract class AbstractSimilarity
{
    /**
     * The Similarity implementation used by default.
     *
     * Static, so it is shared by every caller of getDefault()/setDefault().
     * Stays unset until setDefault() or the first getDefault() call.
     *
     * @var AbstractSimilarity|null
     */
    private static $_defaultImpl;

    /**
     * Cache of decoded bytes.
     *
     * Maps every encoded norm byte (0..255) to its decoded float value.
     * The values are strictly increasing; _floatToByte() relies on that
     * ordering to binary-search the table.
     *
     * @var array
     */
    private static $_normTable = array(
        0   => 0.0,
        1   => 5.820766E-10,
        2   => 6.9849193E-10,
        3   => 8.1490725E-10,
        4   => 9.313226E-10,
        5   => 1.1641532E-9,
        6   => 1.3969839E-9,
        7   => 1.6298145E-9,
        8   => 1.8626451E-9,
        9   => 2.3283064E-9,
        10  => 2.7939677E-9,
        11  => 3.259629E-9,
        12  => 3.7252903E-9,
        13  => 4.656613E-9,
        14  => 5.5879354E-9,
        15  => 6.519258E-9,
        16  => 7.4505806E-9,
        17  => 9.313226E-9,
        18  => 1.1175871E-8,
        19  => 1.3038516E-8,
        20  => 1.4901161E-8,
        21  => 1.8626451E-8,
        22  => 2.2351742E-8,
        23  => 2.6077032E-8,
        24  => 2.9802322E-8,
        25  => 3.7252903E-8,
        26  => 4.4703484E-8,
        27  => 5.2154064E-8,
        28  => 5.9604645E-8,
        29  => 7.4505806E-8,
        30  => 8.940697E-8,
        31  => 1.0430813E-7,
        32  => 1.1920929E-7,
        33  => 1.4901161E-7,
        34  => 1.7881393E-7,
        35  => 2.0861626E-7,
        36  => 2.3841858E-7,
        37  => 2.9802322E-7,
        38  => 3.5762787E-7,
        39  => 4.172325E-7,
        40  => 4.7683716E-7,
        41  => 5.9604645E-7,
        42  => 7.1525574E-7,
        43  => 8.34465E-7,
        44  => 9.536743E-7,
        45  => 1.1920929E-6,
        46  => 1.4305115E-6,
        47  => 1.66893E-6,
        48  => 1.9073486E-6,
        49  => 2.3841858E-6,
        50  => 2.861023E-6,
        51  => 3.33786E-6,
        52  => 3.8146973E-6,
        53  => 4.7683716E-6,
        54  => 5.722046E-6,
        55  => 6.67572E-6,
        56  => 7.6293945E-6,
        57  => 9.536743E-6,
        58  => 1.1444092E-5,
        59  => 1.335144E-5,
        60  => 1.5258789E-5,
        61  => 1.9073486E-5,
        62  => 2.2888184E-5,
        63  => 2.670288E-5,
        64  => 3.0517578E-5,
        65  => 3.8146973E-5,
        66  => 4.5776367E-5,
        67  => 5.340576E-5,
        68  => 6.1035156E-5,
        69  => 7.6293945E-5,
        70  => 9.1552734E-5,
        71  => 1.0681152E-4,
        72  => 1.2207031E-4,
        73  => 1.5258789E-4,
        74  => 1.8310547E-4,
        75  => 2.1362305E-4,
        76  => 2.4414062E-4,
        77  => 3.0517578E-4,
        78  => 3.6621094E-4,
        79  => 4.272461E-4,
        80  => 4.8828125E-4,
        81  => 6.1035156E-4,
        82  => 7.324219E-4,
        83  => 8.544922E-4,
        84  => 9.765625E-4,
        85  => 0.0012207031,
        86  => 0.0014648438,
        87  => 0.0017089844,
        88  => 0.001953125,
        89  => 0.0024414062,
        90  => 0.0029296875,
        91  => 0.0034179688,
        92  => 0.00390625,
        93  => 0.0048828125,
        94  => 0.005859375,
        95  => 0.0068359375,
        96  => 0.0078125,
        97  => 0.009765625,
        98  => 0.01171875,
        99  => 0.013671875,
        100 => 0.015625,
        101 => 0.01953125,
        102 => 0.0234375,
        103 => 0.02734375,
        104 => 0.03125,
        105 => 0.0390625,
        106 => 0.046875,
        107 => 0.0546875,
        108 => 0.0625,
        109 => 0.078125,
        110 => 0.09375,
        111 => 0.109375,
        112 => 0.125,
        113 => 0.15625,
        114 => 0.1875,
        115 => 0.21875,
        116 => 0.25,
        117 => 0.3125,
        118 => 0.375,
        119 => 0.4375,
        120 => 0.5,
        121 => 0.625,
        122 => 0.75,
        123 => 0.875,
        124 => 1.0,
        125 => 1.25,
        126 => 1.5,
        127 => 1.75,
        128 => 2.0,
        129 => 2.5,
        130 => 3.0,
        131 => 3.5,
        132 => 4.0,
        133 => 5.0,
        134 => 6.0,
        135 => 7.0,
        136 => 8.0,
        137 => 10.0,
        138 => 12.0,
        139 => 14.0,
        140 => 16.0,
        141 => 20.0,
        142 => 24.0,
        143 => 28.0,
        144 => 32.0,
        145 => 40.0,
        146 => 48.0,
        147 => 56.0,
        148 => 64.0,
        149 => 80.0,
        150 => 96.0,
        151 => 112.0,
        152 => 128.0,
        153 => 160.0,
        154 => 192.0,
        155 => 224.0,
        156 => 256.0,
        157 => 320.0,
        158 => 384.0,
        159 => 448.0,
        160 => 512.0,
        161 => 640.0,
        162 => 768.0,
        163 => 896.0,
        164 => 1024.0,
        165 => 1280.0,
        166 => 1536.0,
        167 => 1792.0,
        168 => 2048.0,
        169 => 2560.0,
        170 => 3072.0,
        171 => 3584.0,
        172 => 4096.0,
        173 => 5120.0,
        174 => 6144.0,
        175 => 7168.0,
        176 => 8192.0,
        177 => 10240.0,
        178 => 12288.0,
        179 => 14336.0,
        180 => 16384.0,
        181 => 20480.0,
        182 => 24576.0,
        183 => 28672.0,
        184 => 32768.0,
        185 => 40960.0,
        186 => 49152.0,
        187 => 57344.0,
        188 => 65536.0,
        189 => 81920.0,
        190 => 98304.0,
        191 => 114688.0,
        192 => 131072.0,
        193 => 163840.0,
        194 => 196608.0,
        195 => 229376.0,
        196 => 262144.0,
        197 => 327680.0,
        198 => 393216.0,
        199 => 458752.0,
        200 => 524288.0,
        201 => 655360.0,
        202 => 786432.0,
        203 => 917504.0,
        204 => 1048576.0,
        205 => 1310720.0,
        206 => 1572864.0,
        207 => 1835008.0,
        208 => 2097152.0,
        209 => 2621440.0,
        210 => 3145728.0,
        211 => 3670016.0,
        212 => 4194304.0,
        213 => 5242880.0,
        214 => 6291456.0,
        215 => 7340032.0,
        216 => 8388608.0,
        217 => 1.048576E7,
        218 => 1.2582912E7,
        219 => 1.4680064E7,
        220 => 1.6777216E7,
        221 => 2.097152E7,
        222 => 2.5165824E7,
        223 => 2.9360128E7,
        224 => 3.3554432E7,
        225 => 4.194304E7,
        226 => 5.0331648E7,
        227 => 5.8720256E7,
        228 => 6.7108864E7,
        229 => 8.388608E7,
        230 => 1.00663296E8,
        231 => 1.17440512E8,
        232 => 1.34217728E8,
        233 => 1.6777216E8,
        234 => 2.01326592E8,
        235 => 2.34881024E8,
        236 => 2.68435456E8,
        237 => 3.3554432E8,
        238 => 4.02653184E8,
        239 => 4.69762048E8,
        240 => 5.3687091E8,
        241 => 6.7108864E8,
        242 => 8.0530637E8,
        243 => 9.395241E8,
        244 => 1.07374182E9,
        245 => 1.34217728E9,
        246 => 1.61061274E9,
        247 => 1.87904819E9,
        248 => 2.14748365E9,
        249 => 2.68435456E9,
        250 => 3.22122547E9,
        251 => 3.75809638E9,
        252 => 4.2949673E9,
        253 => 5.3687091E9,
        254 => 6.4424509E9,
        255 => 7.5161928E9,
    );


    /**
     * Set the default Similarity implementation used by indexing and search
     * code.
     *
     * @param AbstractSimilarity $similarity
     */
    public static function setDefault(AbstractSimilarity $similarity)
    {
        self::$_defaultImpl = $similarity;
    }


    /**
     * Return the default Similarity implementation used by indexing and search
     * code.
     *
     * If no implementation has been set yet, a DefaultSimilarity instance is
     * created, stored and returned.
     *
     * @return AbstractSimilarity
     */
    public static function getDefault()
    {
        if (!self::$_defaultImpl instanceof AbstractSimilarity) {
            self::$_defaultImpl = new DefaultSimilarity();
        }

        return self::$_defaultImpl;
    }


    /**
     * Computes the normalization value for a field given the total number of
     * terms contained in a field.  These values, together with field boosts, are
     * stored in an index and multiplied into scores for hits on each field by the
     * search code.
     *
     * Matches in longer fields are less precise, so implementations of this
     * method usually return smaller values when 'numTokens' is large,
     * and larger values when 'numTokens' is small.
     *
     * Note that these values are computed under
     * IndexWriter::addDocument(Document) and then stored using
     * encodeNorm(float).  Thus they have limited precision, and documents
     * must be re-indexed if this method is altered.
     *
     * fieldName - name of field
     * numTokens - the total number of tokens contained in fields named
     *             'fieldName' of 'doc'.
     * Returns a normalization factor for hits on this field of this document
     *
     * @param string $fieldName
     * @param integer $numTokens
     * @return float
     */
    abstract public function lengthNorm($fieldName, $numTokens);

    /**
     * Computes the normalization value for a query given the sum of the squared
     * weights of each of the query terms.  This value is then multiplied into the
     * weight of each query term.
     *
     * This does not affect ranking, but rather just attempts to make scores
     * from different queries comparable.
     *
     * sumOfSquaredWeights - the sum of the squares of query term weights
     * Returns a normalization factor for query weights
     *
     * @param float $sumOfSquaredWeights
     * @return float
     */
    abstract public function queryNorm($sumOfSquaredWeights);


    /**
     * Decodes a normalization factor stored in an index.
     *
     * Only the low eight bits of $byte are used to look up the norm table.
     *
     * @param integer $byte
     * @return float
     */
    public static function decodeNorm($byte)
    {
        return self::$_normTable[$byte & 0xFF];
    }


    /**
     * Encodes a normalization factor for storage in an index.
     *
     * The encoding uses a five-bit exponent and three-bit mantissa, thus
     * representing values from around 7x10^9 to 2x10^-9 with about one
     * significant decimal digit of accuracy.  Zero is also represented.
     * Negative numbers are rounded up to zero.  Values too large to represent
     * are rounded down to the largest representable value.  Positive values too
     * small to represent are rounded up to the smallest positive representable
     * value.
     *
     * @param float $f
     * @return integer
     */
    public static function encodeNorm($f)
    {
        return self::_floatToByte($f);
    }

    /**
     * Float to byte conversion.
     *
     * Finds the index of the norm table entry closest to $f. Zero and
     * negative input maps to 0; positive input never maps to 0, so a
     * positive norm cannot decode back to 0.0.
     *
     * @param float $f value to encode
     * @return integer norm table index in the range 0..255
     */
    private static function _floatToByte($f)
    {
        // round negatives up to zero
        if ($f <= 0.0) {
            return 0;
        }

        // binary search over the strictly increasing norm table
        $lowIndex  = 0;
        $highIndex = 255;
        while ($highIndex >= $lowIndex) {
            $mid   = ($highIndex + $lowIndex) >> 1;
            $delta = $f - self::$_normTable[$mid];

            if ($delta < 0) {
                $highIndex = $mid - 1;
            } elseif ($delta > 0) {
                $lowIndex = $mid + 1;
            } else {
                return $mid; // We got it!
            }
        }

        // No exact match: $highIndex is now the last entry below $f.
        // $f is positive here, so index 0 means it lies between zero and the
        // smallest positive entry; round it up to that entry instead of
        // letting it collapse to zero.
        if ($highIndex === 0) {
            return 1;
        }

        // round to closest value (255 has no upper neighbour to compare with)
        if ($highIndex !== 255
            && $f - self::$_normTable[$highIndex] > self::$_normTable[$highIndex + 1] - $f
        ) {
            return $highIndex + 1;
        }

        return $highIndex;
    }


    /**
     * Computes a score factor based on a term or phrase's frequency in a
     * document.  This value is multiplied by the idf(Term, Searcher)
     * factor for each term in the query and these products are then summed to
     * form the initial score for a document.
     *
     * Terms and phrases repeated in a document indicate the topic of the
     * document, so implementations of this method usually return larger values
     * when 'freq' is large, and smaller values when 'freq'
     * is small.
     *
     * freq - the frequency of a term within a document
     * Returns a score factor based on a term's within-document frequency
     *
     * @param float $freq
     * @return float
     */
    abstract public function tf($freq);

    /**
     * Computes the amount of a sloppy phrase match, based on an edit distance.
     * This value is summed for each sloppy phrase match in a document to form
     * the frequency that is passed to tf(float).
     *
     * A phrase match with a small edit distance to a document passage more
     * closely matches the document, so implementations of this method usually
     * return larger values when the edit distance is small and smaller values
     * when it is large.
     *
     * distance - the edit distance of this sloppy phrase match
     * Returns the frequency increment for this match
     *
     * @param integer $distance
     * @return float
     */
    abstract public function sloppyFreq($distance);


    /**
     * Computes a score factor for a simple term or a phrase.
     *
     * For a single term this is idfFreq(docFreq(term), count()); for an array
     * of terms it is the sum of that value over every term in the array.
     *
     * input - the term in question or array of terms
     * reader - reader the document collection being searched
     * Returns a score factor for the term
     *
     * @param mixed $input
     * @param \ZendSearch\Lucene\SearchIndexInterface $reader
     * @return float a score factor for the term
     */
    public function idf($input, \ZendSearch\Lucene\SearchIndexInterface $reader)
    {
        // The collection size is the same for every term, so fetch it once.
        $numDocs = $reader->count();

        if (!is_array($input)) {
            return $this->idfFreq($reader->docFreq($input), $numDocs);
        }

        $idf = 0.0;
        foreach ($input as $term) {
            $idf += $this->idfFreq($reader->docFreq($term), $numDocs);
        }

        return $idf;
    }

    /**
     * Computes a score factor based on a term's document frequency (the number
     * of documents which contain the term).  This value is multiplied by the
     * tf(int) factor for each term in the query and these products are
     * then summed to form the initial score for a document.
     *
     * Terms that occur in fewer documents are better indicators of topic, so
     * implementations of this method usually return larger values for rare terms,
     * and smaller values for common terms.
     *
     * docFreq - the number of documents which contain the term
     * numDocs - the total number of documents in the collection
     * Returns a score factor based on the term's document frequency
     *
     * @param integer $docFreq
     * @param integer $numDocs
     * @return float
     */
    abstract public function idfFreq($docFreq, $numDocs);

    /**
     * Computes a score factor based on the fraction of all query terms that a
     * document contains.  This value is multiplied into scores.
     *
     * The presence of a large portion of the query terms indicates a better
     * match with the query, so implementations of this method usually return
     * larger values when the ratio between these parameters is large and smaller
     * values when the ratio between them is small.
     *
     * overlap - the number of query terms matched in the document
     * maxOverlap - the total number of terms in the query
     * Returns a score factor based on term overlap with the query
     *
     * @param integer $overlap
     * @param integer $maxOverlap
     * @return float
     */
    abstract public function coord($overlap, $maxOverlap);
}
541