<?php
/**
 * Zend Framework (http://framework.zend.com/)
 *
 * @link      http://github.com/zendframework/zf2 for the canonical source repository
 * @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
 * @license   http://framework.zend.com/license/new-bsd New BSD License
 * @package   Zend_Search
 */

namespace ZendSearch\Lucene\Search\Similarity;

/**
 * Base class for scoring "similarity" implementations.
 *
 * Concrete subclasses supply the individual scoring factors (lengthNorm,
 * queryNorm, tf, sloppyFreq, idfFreq, coord). This class provides the shared
 * default-implementation registry and the one-byte norm encoding/decoding.
 *
 * @todo !!!!!!! This class is actually used as singleton. It has to be redesigned.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Search
 */
abstract class AbstractSimilarity
{
    /**
     * The Similarity implementation used by default.
     *
     * @var AbstractSimilarity
     */
    private static $_defaultImpl;

    /**
     * Cache of decoded bytes: maps every byte value (0..255) to the float
     * normalization factor it represents. Values are strictly increasing,
     * which _floatToByte() relies on for its binary search.
     *
     * @var array
     */
    private static $_normTable = array(
        0   => 0.0,
        1   => 5.820766E-10,
        2   => 6.9849193E-10,
        3   => 8.1490725E-10,
        4   => 9.313226E-10,
        5   => 1.1641532E-9,
        6   => 1.3969839E-9,
        7   => 1.6298145E-9,
        8   => 1.8626451E-9,
        9   => 2.3283064E-9,
        10  => 2.7939677E-9,
        11  => 3.259629E-9,
        12  => 3.7252903E-9,
        13  => 4.656613E-9,
        14  => 5.5879354E-9,
        15  => 6.519258E-9,
        16  => 7.4505806E-9,
        17  => 9.313226E-9,
        18  => 1.1175871E-8,
        19  => 1.3038516E-8,
        20  => 1.4901161E-8,
        21  => 1.8626451E-8,
        22  => 2.2351742E-8,
        23  => 2.6077032E-8,
        24  => 2.9802322E-8,
        25  => 3.7252903E-8,
        26  => 4.4703484E-8,
        27  => 5.2154064E-8,
        28  => 5.9604645E-8,
        29  => 7.4505806E-8,
        30  => 8.940697E-8,
        31  => 1.0430813E-7,
        32  => 1.1920929E-7,
        33  => 1.4901161E-7,
        34  => 1.7881393E-7,
        35  => 2.0861626E-7,
        36  => 2.3841858E-7,
        37  => 2.9802322E-7,
        38  => 3.5762787E-7,
        39  => 4.172325E-7,
        40  => 4.7683716E-7,
        41  => 5.9604645E-7,
        42  => 7.1525574E-7,
        43  => 8.34465E-7,
        44  => 9.536743E-7,
        45  => 1.1920929E-6,
        46  => 1.4305115E-6,
        47  => 1.66893E-6,
        48  => 1.9073486E-6,
        49  => 2.3841858E-6,
        50  => 2.861023E-6,
        51  => 3.33786E-6,
        52  => 3.8146973E-6,
        53  => 4.7683716E-6,
        54  => 5.722046E-6,
        55  => 6.67572E-6,
        56  => 7.6293945E-6,
        57  => 9.536743E-6,
        58  => 1.1444092E-5,
        59  => 1.335144E-5,
        60  => 1.5258789E-5,
        61  => 1.9073486E-5,
        62  => 2.2888184E-5,
        63  => 2.670288E-5,
        64  => 3.0517578E-5,
        65  => 3.8146973E-5,
        66  => 4.5776367E-5,
        67  => 5.340576E-5,
        68  => 6.1035156E-5,
        69  => 7.6293945E-5,
        70  => 9.1552734E-5,
        71  => 1.0681152E-4,
        72  => 1.2207031E-4,
        73  => 1.5258789E-4,
        74  => 1.8310547E-4,
        75  => 2.1362305E-4,
        76  => 2.4414062E-4,
        77  => 3.0517578E-4,
        78  => 3.6621094E-4,
        79  => 4.272461E-4,
        80  => 4.8828125E-4,
        81  => 6.1035156E-4,
        82  => 7.324219E-4,
        83  => 8.544922E-4,
        84  => 9.765625E-4,
        85  => 0.0012207031,
        86  => 0.0014648438,
        87  => 0.0017089844,
        88  => 0.001953125,
        89  => 0.0024414062,
        90  => 0.0029296875,
        91  => 0.0034179688,
        92  => 0.00390625,
        93  => 0.0048828125,
        94  => 0.005859375,
        95  => 0.0068359375,
        96  => 0.0078125,
        97  => 0.009765625,
        98  => 0.01171875,
        99  => 0.013671875,
        100 => 0.015625,
        101 => 0.01953125,
        102 => 0.0234375,
        103 => 0.02734375,
        104 => 0.03125,
        105 => 0.0390625,
        106 => 0.046875,
        107 => 0.0546875,
        108 => 0.0625,
        109 => 0.078125,
        110 => 0.09375,
        111 => 0.109375,
        112 => 0.125,
        113 => 0.15625,
        114 => 0.1875,
        115 => 0.21875,
        116 => 0.25,
        117 => 0.3125,
        118 => 0.375,
        119 => 0.4375,
        120 => 0.5,
        121 => 0.625,
        122 => 0.75,
        123 => 0.875,
        124 => 1.0,
        125 => 1.25,
        126 => 1.5,
        127 => 1.75,
        128 => 2.0,
        129 => 2.5,
        130 => 3.0,
        131 => 3.5,
        132 => 4.0,
        133 => 5.0,
        134 => 6.0,
        135 => 7.0,
        136 => 8.0,
        137 => 10.0,
        138 => 12.0,
        139 => 14.0,
        140 => 16.0,
        141 => 20.0,
        142 => 24.0,
        143 => 28.0,
        144 => 32.0,
        145 => 40.0,
        146 => 48.0,
        147 => 56.0,
        148 => 64.0,
        149 => 80.0,
        150 => 96.0,
        151 => 112.0,
        152 => 128.0,
        153 => 160.0,
        154 => 192.0,
        155 => 224.0,
        156 => 256.0,
        157 => 320.0,
        158 => 384.0,
        159 => 448.0,
        160 => 512.0,
        161 => 640.0,
        162 => 768.0,
        163 => 896.0,
        164 => 1024.0,
        165 => 1280.0,
        166 => 1536.0,
        167 => 1792.0,
        168 => 2048.0,
        169 => 2560.0,
        170 => 3072.0,
        171 => 3584.0,
        172 => 4096.0,
        173 => 5120.0,
        174 => 6144.0,
        175 => 7168.0,
        176 => 8192.0,
        177 => 10240.0,
        178 => 12288.0,
        179 => 14336.0,
        180 => 16384.0,
        181 => 20480.0,
        182 => 24576.0,
        183 => 28672.0,
        184 => 32768.0,
        185 => 40960.0,
        186 => 49152.0,
        187 => 57344.0,
        188 => 65536.0,
        189 => 81920.0,
        190 => 98304.0,
        191 => 114688.0,
        192 => 131072.0,
        193 => 163840.0,
        194 => 196608.0,
        195 => 229376.0,
        196 => 262144.0,
        197 => 327680.0,
        198 => 393216.0,
        199 => 458752.0,
        200 => 524288.0,
        201 => 655360.0,
        202 => 786432.0,
        203 => 917504.0,
        204 => 1048576.0,
        205 => 1310720.0,
        206 => 1572864.0,
        207 => 1835008.0,
        208 => 2097152.0,
        209 => 2621440.0,
        210 => 3145728.0,
        211 => 3670016.0,
        212 => 4194304.0,
        213 => 5242880.0,
        214 => 6291456.0,
        215 => 7340032.0,
        216 => 8388608.0,
        217 => 1.048576E7,
        218 => 1.2582912E7,
        219 => 1.4680064E7,
        220 => 1.6777216E7,
        221 => 2.097152E7,
        222 => 2.5165824E7,
        223 => 2.9360128E7,
        224 => 3.3554432E7,
        225 => 4.194304E7,
        226 => 5.0331648E7,
        227 => 5.8720256E7,
        228 => 6.7108864E7,
        229 => 8.388608E7,
        230 => 1.00663296E8,
        231 => 1.17440512E8,
        232 => 1.34217728E8,
        233 => 1.6777216E8,
        234 => 2.01326592E8,
        235 => 2.34881024E8,
        236 => 2.68435456E8,
        237 => 3.3554432E8,
        238 => 4.02653184E8,
        239 => 4.69762048E8,
        240 => 5.3687091E8,
        241 => 6.7108864E8,
        242 => 8.0530637E8,
        243 => 9.395241E8,
        244 => 1.07374182E9,
        245 => 1.34217728E9,
        246 => 1.61061274E9,
        247 => 1.87904819E9,
        248 => 2.14748365E9,
        249 => 2.68435456E9,
        250 => 3.22122547E9,
        251 => 3.75809638E9,
        252 => 4.2949673E9,
        253 => 5.3687091E9,
        254 => 6.4424509E9,
        255 => 7.5161928E9
    );


    /**
     * Set the default Similarity implementation used by indexing and search
     * code.
     *
     * @param AbstractSimilarity $similarity
     */
    public static function setDefault(AbstractSimilarity $similarity)
    {
        self::$_defaultImpl = $similarity;
    }


    /**
     * Return the default Similarity implementation used by indexing and search
     * code.
     *
     * If no implementation has been registered through setDefault(), a
     * DefaultSimilarity instance is created, stored and returned.
     *
     * @return AbstractSimilarity
     */
    public static function getDefault()
    {
        if (!self::$_defaultImpl instanceof AbstractSimilarity) {
            self::$_defaultImpl = new DefaultSimilarity();
        }

        return self::$_defaultImpl;
    }


    /**
     * Computes the normalization value for a field given the total number of
     * terms contained in a field. These values, together with field boosts, are
     * stored in an index and multiplied into scores for hits on each field by
     * the search code.
     *
     * Matches in longer fields are less precise, so implementations of this
     * method usually return smaller values when 'numTokens' is large,
     * and larger values when 'numTokens' is small.
     *
     * Note that these values are computed under
     * IndexWriter::addDocument(Document) and then stored using
     * encodeNorm(float). Thus they have limited precision, and documents
     * must be re-indexed if this method is altered.
     *
     * fieldName - name of field
     * numTokens - the total number of tokens contained in fields named
     *             'fieldName' of 'doc'.
     * Returns a normalization factor for hits on this field of this document
     *
     * @param string  $fieldName
     * @param integer $numTokens
     * @return float
     */
    abstract public function lengthNorm($fieldName, $numTokens);

    /**
     * Computes the normalization value for a query given the sum of the squared
     * weights of each of the query terms. This value is then multiplied into
     * the weight of each query term.
     *
     * This does not affect ranking, but rather just attempts to make scores
     * from different queries comparable.
     *
     * sumOfSquaredWeights - the sum of the squares of query term weights
     * Returns a normalization factor for query weights
     *
     * @param float $sumOfSquaredWeights
     * @return float
     */
    abstract public function queryNorm($sumOfSquaredWeights);


    /**
     * Decodes a normalization factor stored in an index.
     *
     * Only the low 8 bits of $byte are used to index the norm table.
     *
     * @param integer $byte
     * @return float
     */
    public static function decodeNorm($byte)
    {
        return self::$_normTable[$byte & 0xFF];
    }


    /**
     * Encodes a normalization factor for storage in an index.
     *
     * The encoding uses a five-bit exponent and three-bit mantissa, thus
     * representing values from around 7x10^9 to 2x10^-9 with about one
     * significant decimal digit of accuracy. Zero is also represented.
     * Negative numbers are rounded up to zero. Values too large to represent
     * are rounded down to the largest representable value. Positive values too
     * small to represent are rounded up to the smallest positive representable
     * value.
     *
     * @param float $f
     * @return integer
     */
    public static function encodeNorm($f)
    {
        return self::_floatToByte($f);
    }

    /**
     * Float to byte conversion.
     *
     * Finds the index (0..255) of the norm table entry closest to $f using a
     * binary search over the strictly increasing table.
     *
     * @param float $f
     * @return integer
     */
    private static function _floatToByte($f)
    {
        // round negatives up to zero
        if ($f <= 0.0) {
            return 0;
        }

        // search for appropriate value
        $lowIndex  = 0;
        $highIndex = 255;
        while ($highIndex >= $lowIndex) {
            $mid   = ($highIndex + $lowIndex) >> 1;
            $delta = $f - self::$_normTable[$mid];

            if ($delta < 0) {
                $highIndex = $mid - 1;
            } elseif ($delta > 0) {
                $lowIndex = $mid + 1;
            } else {
                return $mid; // We got it!
            }
        }

        // No exact match: $highIndex is now the last entry smaller than $f.

        // Positive underflow: $f lies between 0.0 and the smallest positive
        // table value. Per the encodeNorm() contract it must round *up* to
        // that value instead of collapsing to the zero entry.
        if ($highIndex === 0) {
            return 1;
        }

        // Round to the closest neighbour; an exact tie keeps the lower index.
        // Index 255 has no upper neighbour, so overflow stays at 255.
        if ($highIndex != 255
            && $f - self::$_normTable[$highIndex] > self::$_normTable[$highIndex + 1] - $f
        ) {
            return $highIndex + 1;
        }

        return $highIndex;
    }


    /**
     * Computes a score factor based on a term or phrase's frequency in a
     * document. This value is multiplied by the idf(Term, Searcher)
     * factor for each term in the query and these products are then summed to
     * form the initial score for a document.
     *
     * Terms and phrases repeated in a document indicate the topic of the
     * document, so implementations of this method usually return larger values
     * when 'freq' is large, and smaller values when 'freq'
     * is small.
     *
     * freq - the frequency of a term within a document
     * Returns a score factor based on a term's within-document frequency
     *
     * @param float $freq
     * @return float
     */
    abstract public function tf($freq);

    /**
     * Computes the amount of a sloppy phrase match, based on an edit distance.
     * This value is summed for each sloppy phrase match in a document to form
     * the frequency that is passed to tf(float).
     *
     * A phrase match with a small edit distance to a document passage more
     * closely matches the document, so implementations of this method usually
     * return larger values when the edit distance is small and smaller values
     * when it is large.
     *
     * distance - the edit distance of this sloppy phrase match
     * Returns the frequency increment for this match
     *
     * @param integer $distance
     * @return float
     */
    abstract public function sloppyFreq($distance);


    /**
     * Computes a score factor for a simple term or a phrase.
     *
     * For a single term this is idfFreq() applied to the term's document
     * frequency and the reader's document count. For an array of terms it is
     * the sum of that value over every term (0.0 for an empty array).
     *
     * input  - the term in question or array of terms
     * reader - reader of the document collection being searched
     * Returns a score factor for the term
     *
     * @param mixed $input
     * @param \ZendSearch\Lucene\SearchIndexInterface $reader
     * @return float a score factor for the term
     */
    public function idf($input, \ZendSearch\Lucene\SearchIndexInterface $reader)
    {
        if (!is_array($input)) {
            return $this->idfFreq($reader->docFreq($input), $reader->count());
        }

        $idf = 0.0;
        foreach ($input as $term) {
            $idf += $this->idfFreq($reader->docFreq($term), $reader->count());
        }

        return $idf;
    }

    /**
     * Computes a score factor based on a term's document frequency (the number
     * of documents which contain the term). This value is multiplied by the
     * tf(int) factor for each term in the query and these products are
     * then summed to form the initial score for a document.
     *
     * Terms that occur in fewer documents are better indicators of topic, so
     * implementations of this method usually return larger values for rare
     * terms, and smaller values for common terms.
     *
     * docFreq - the number of documents which contain the term
     * numDocs - the total number of documents in the collection
     * Returns a score factor based on the term's document frequency
     *
     * @param integer $docFreq
     * @param integer $numDocs
     * @return float
     */
    abstract public function idfFreq($docFreq, $numDocs);

    /**
     * Computes a score factor based on the fraction of all query terms that a
     * document contains. This value is multiplied into scores.
     *
     * The presence of a large portion of the query terms indicates a better
     * match with the query, so implementations of this method usually return
     * larger values when the ratio between these parameters is large and
     * smaller values when the ratio between them is small.
     *
     * overlap    - the number of query terms matched in the document
     * maxOverlap - the total number of terms in the query
     * Returns a score factor based on term overlap with the query
     *
     * @param integer $overlap
     * @param integer $maxOverlap
     * @return float
     */
    abstract public function coord($overlap, $maxOverlap);
}