<?php

//
// Keyword suggestion ("did you mean") sample built on top of Sphinx.
//
// Two modes, selected on the command line:
//   --builddict    reads "keyword frequency" lines from stdin and prints
//                  an SQL dump of the trigram dictionary to stdout
//   --query WORD   queries the "suggest" Sphinx index and prints a suggestion
//

define ( "FREQ_THRESHOLD", 40 );		// keywords with a lower frequency are left out of the dictionary
define ( "SUGGEST_DEBUG", 0 );			// non-zero makes MakeSuggestion() dump every candidate
define ( "LENGTH_THRESHOLD", 2 );		// max allowed length difference between keyword and candidate
define ( "LEVENSHTEIN_THRESHOLD", 2 );	// max allowed edit distance for an accepted suggestion
define ( "TOP_COUNT", 10 );				// how many trigram matches to pull from Sphinx

require ( "../../api/sphinxapi.php" );

/// build a list of trigrams for a given keyword
///
/// the keyword is padded with "__" on both sides, so that a keyword of N bytes
/// yields N+2 trigrams; every trigram (including the last one) is followed by
/// a single space. substr() is used, so trigrams are taken byte-wise
function BuildTrigrams ( $keyword )
{
	$t = "__" . $keyword . "__";
	$last = strlen($t) - 2; // hoisted out of the loop condition

	$trigrams = "";
	for ( $i=0; $i<$last; $i++ )
		$trigrams .= substr ( $t, $i, 3 ) . " ";

	return $trigrams;
}


/// create SQL dump of the dictionary from Sphinx stopwords file
/// expects open files as parameters
///
/// $in is read line by line, every line being "keyword frequency";
/// $out receives DROP/CREATE TABLE statements followed by multi-row INSERTs
/// of at most 10000 rows each. keywords that are too rare (FREQ_THRESHOLD),
/// or contain an underscore or a single quote, are skipped
function BuildDictionarySQL ( $out, $in )
{
	fwrite ( $out, "DROP TABLE IF EXISTS suggest;

CREATE TABLE suggest (
	id INTEGER PRIMARY KEY AUTO_INCREMENT NOT NULL,
	keyword VARCHAR(255) NOT NULL,
	trigrams VARCHAR(255) NOT NULL,
	freq INTEGER NOT NULL,
	UNIQUE(keyword)
);

" );

	$n = 0; // total rows written
	$m = 0; // rows written into the current INSERT statement
	// compare against false explicitly, so that a falsy line (such as a final
	// "0" without a newline) does not terminate the loop early
	while ( ( $line = fgets ( $in, 1024 ) )!==false )
	{
		// split() is gone as of PHP 7.0; lines without a frequency column are skipped
		$parts = preg_split ( "/\\s+/", trim ( $line ) );
		if ( count($parts)<2 )
			continue;

		$keyword = $parts[0];
		$freq = (int)$parts[1];

		if ( $freq<FREQ_THRESHOLD || strstr ( $keyword, "_" )!==false || strstr ( $keyword, "'" )!==false )
			continue;

		$trigrams = BuildTrigrams ( $keyword );

		// all the pieces of the statement must go to the same stream as the
		// row tuples, otherwise the dump is only valid when $out is stdout
		if ( !$m )
			fwrite ( $out, "INSERT INTO suggest VALUES\n" );
		else
			fwrite ( $out, ",\n" );

		$n++;
		fwrite ( $out, "( 0, '$keyword', '$trigrams', $freq )" );

		$m++;
		if ( ( $m % 10000 )==0 )
		{
			// close this batch; the next row starts a new INSERT
			fwrite ( $out, ";\n" );
			$m = 0;
		}
	}

	// terminate the last, partially filled INSERT
	if ( $m )
		fwrite ( $out, ";" );
}


/// search for suggestions
///
/// runs a quorum query over keyword trigrams against the "suggest" index,
/// restricted to candidates whose length differs by at most LENGTH_THRESHOLD,
/// then returns the first candidate within LEVENSHTEIN_THRESHOLD edits.
///
/// returns false when the query fails or matches nothing, the suggested
/// keyword when one is close enough, and $keyword itself otherwise
function MakeSuggestion ( $keyword )
{
	$trigrams = BuildTrigrams ( $keyword );
	$query = "\"$trigrams\"/1";
	$len = strlen($keyword);

	$delta = LENGTH_THRESHOLD;
	$cl = new SphinxClient ();
	$cl->SetMatchMode ( SPH_MATCH_EXTENDED2 );
	$cl->SetRankingMode ( SPH_RANK_WORDCOUNT );
	$cl->SetFilterRange ( "len", $len-$delta, $len+$delta );
	// rank by matched trigram count, penalized by the length difference
	$cl->SetSelect ( "*, @weight+$delta-abs(len-$len) AS myrank" );
	$cl->SetSortMode ( SPH_SORT_EXTENDED, "myrank DESC, freq DESC" );
	$cl->SetArrayResult ( true );

	// pull top-N best trigram matches and run them through Levenshtein
	$res = $cl->Query ( $query, "suggest", 0, TOP_COUNT );

	if ( !$res || empty ( $res["matches"] ) )
		return false;

	if ( SUGGEST_DEBUG )
	{
		print "--- DEBUG START ---\n";

		foreach ( $res["matches"] as $match )
		{
			$attrs = isset ( $match["attrs"] ) ? $match["attrs"] : array();
			// the keyword lives in the attributes, same as in the loop below
			$w = isset ( $attrs["keyword"] ) ? $attrs["keyword"] : "";
			$freq = isset ( $attrs["freq"] ) ? $attrs["freq"] : "";
			$myrank = "";
			if ( !empty ( $attrs["myrank"] ) )
				$myrank = ", myrank=" . $attrs["myrank"];
			$levdist = levenshtein ( $keyword, $w );

			print "id={$match["id"]}, weight={$match["weight"]}, freq={$freq}{$myrank}, word=$w, levdist=$levdist\n";
		}

		print "--- DEBUG END ---\n";
	}

	// further restrict trigram matches with a sane Levenshtein distance limit
	foreach ( $res["matches"] as $match )
	{
		if ( !isset ( $match["attrs"]["keyword"] ) )
			continue;

		$suggested = $match["attrs"]["keyword"];
		if ( levenshtein ( $keyword, $suggested )<=LEVENSHTEIN_THRESHOLD )
			return $suggested;
	}
	return $keyword;
}

/// main
$usage = "usage:\n"
	. "php suggest.php --builddict\treads stopwords from stdin, prints SQL dump of the dictionary to stdout\n"
	. "php suggest.php --query WORD\tqueries Sphinx, prints suggestion\n";

if ( $_SERVER["argc"]<2 )
	die ( $usage );

if ( $_SERVER["argv"][1]=="--builddict" )
{
	$in = fopen ( "php://stdin", "r" );
	$out = fopen ( "php://stdout", "w" );
	BuildDictionarySQL ( $out, $in );

} else if ( $_SERVER["argv"][1]=="--query" )
{
	// the keyword is mandatory in this mode
	if ( $_SERVER["argc"]<3 )
		die ( $usage );

	// no MySQL connection is made here: suggestions come from Sphinx only,
	// and the mysql_* functions are not available as of PHP 7.0
	$keyword = $_SERVER["argv"][2];
	printf ( "keyword: %s\nsuggestion: %s\n", $keyword, MakeSuggestion($keyword) );

} else
{
	die ( $usage );
}