1<?php
2
/// dictionary building: keywords whose frequency is below this value are left out of the SQL dump
define ( 'FREQ_THRESHOLD', 40 );

/// querying: how far the "len" attribute of a candidate may differ from the keyword length
define ( 'LENGTH_THRESHOLD', 2 );
/// querying: maximum Levenshtein distance between the keyword and an accepted suggestion
define ( 'LEVENSHTEIN_THRESHOLD', 2 );
/// querying: number of best trigram matches requested from Sphinx
define ( 'TOP_COUNT', 10 );

/// non-zero makes MakeSuggestion() print every candidate match before filtering
define ( 'SUGGEST_DEBUG', 0 );
8
9require ( "../../api/sphinxapi.php" );
10
/// build the list of trigrams for a given keyword
///
/// The keyword is padded with "__" on both sides, then every 3-byte window
/// of the padded string is emitted, each followed by a single space (so the
/// result always ends with a space).
///
/// $keyword	word to split into trigrams
///
/// returns the space-terminated trigrams as one string
function BuildTrigrams ( $keyword )
{
	$padded = "__" . $keyword . "__";
	$lastStart = strlen ( $padded ) - 3; // offset of the final full 3-byte window

	$pieces = array ();
	for ( $pos=0; $pos<=$lastStart; $pos++ )
		$pieces[] = substr ( $padded, $pos, 3 ) . " ";

	return implode ( "", $pieces );
}
22
23
/// create SQL dump of the dictionary from Sphinx stopwords file
/// expects open files as parameters
///
/// $out	writable stream; the complete SQL dump is written here and nowhere else
/// $in		readable stream with one "keyword frequency" pair per line
///
/// Lines without both fields are ignored. Keywords with a frequency below
/// FREQ_THRESHOLD, or containing "_" or "'", are skipped. Rows are emitted
/// as multi-row INSERT statements holding at most 10000 rows each.
function BuildDictionarySQL ( $out, $in )
{
	fwrite ( $out, "DROP TABLE IF EXISTS suggest;

CREATE TABLE suggest (
	id			INTEGER PRIMARY KEY AUTO_INCREMENT NOT NULL,
	keyword		VARCHAR(255) NOT NULL,
	trigrams	VARCHAR(255) NOT NULL,
	freq		INTEGER NOT NULL,
	UNIQUE(keyword)
);

" );

	$m = 0; // number of rows in the INSERT statement that is currently open
	// compare against false explicitly so that a final line of "0" is not mistaken for EOF
	while ( ( $line = fgets ( $in, 1024 ) )!==false )
	{
		$fields = preg_split ( '/\s+/', trim ( $line ) );
		if ( count ( $fields )<2 )
			continue; // blank or malformed line, no frequency to read

		$keyword = $fields[0];
		$freq = (int)$fields[1]; // cast keeps non-numeric input out of the SQL text

		if ( $freq<FREQ_THRESHOLD || strstr ( $keyword, "_" )!==false || strstr ( $keyword, "'" )!==false )
			continue;

		$trigrams = BuildTrigrams ( $keyword );

		// statement header for the first row, row separator for the rest;
		// both must go to $out so the dump is valid for any target stream
		if ( !$m )
			fwrite ( $out, "INSERT INTO suggest VALUES\n" );
		else
			fwrite ( $out, ",\n" );

		fwrite ( $out, "( 0, '$keyword', '$trigrams', $freq )" );

		// close the statement every 10000 rows to keep each INSERT bounded
		$m++;
		if ( ( $m % 10000 )==0 )
		{
			fwrite ( $out, ";\n" );
			$m = 0;
		}
	}

	// terminate a partially filled trailing statement
	if ( $m )
		fwrite ( $out, ";" );
}
70
71
/// search for suggestions
///
/// Queries the "suggest" Sphinx index with the keyword's trigrams, limits
/// matches to those whose "len" attribute is within LENGTH_THRESHOLD of the
/// keyword length, and returns the best-ranked match whose Levenshtein
/// distance to the keyword does not exceed LEVENSHTEIN_THRESHOLD.
///
/// $keyword	word to find a suggestion for
///
/// returns false when the query fails or yields no matches, the keyword
/// itself when no match is close enough, and the suggested word otherwise
function MakeSuggestion ( $keyword )
{
	$trigrams = BuildTrigrams ( $keyword );
	$query = "\"$trigrams\"/1"; // quorum query: matching any one trigram is enough
	$len = strlen($keyword);

	$delta = LENGTH_THRESHOLD;
	$cl = new SphinxClient ();
	$cl->SetMatchMode ( SPH_MATCH_EXTENDED2 );
	$cl->SetRankingMode ( SPH_RANK_WORDCOUNT );
	$cl->SetFilterRange ( "len", $len-$delta, $len+$delta );
	// rank by matched trigram count, penalised by the difference in length
	$cl->SetSelect ( "*, @weight+$delta-abs(len-$len) AS myrank" );
	$cl->SetSortMode ( SPH_SORT_EXTENDED, "myrank DESC, freq DESC" );
	$cl->SetArrayResult ( true );

	// pull top-N best trigram matches and run them through Levenshtein
	$res = $cl->Query ( $query, "suggest", 0, TOP_COUNT );

	if ( !$res || empty ( $res["matches"] ) )
		return false;

	if ( SUGGEST_DEBUG )
	{
		print "--- DEBUG START ---\n";

		foreach ( $res["matches"] as $match )
		{
			$attrs = $match["attrs"];
			// the keyword is an attribute of the match, same as in the loop below
			$w = $attrs["keyword"];
			$myrank = "";
			if ( isset ( $attrs["myrank"] ) && $attrs["myrank"] )
				$myrank = ", myrank=" . $attrs["myrank"];
			$levdist = levenshtein ( $keyword, $w );

			// array keys are quoted: bare words inside {...} are looked up as constants
			print "id={$match['id']}, weight={$match['weight']}, freq={$attrs['freq']}{$myrank}, word=$w, levdist=$levdist\n";
		}

		print "--- DEBUG END ---\n";
	}

	// further restrict trigram matches with a sane Levenshtein distance limit
	foreach ( $res["matches"] as $match )
	{
		$suggested = $match["attrs"]["keyword"];
		if ( levenshtein ( $keyword, $suggested )<=LEVENSHTEIN_THRESHOLD )
			return $suggested;
	}

	// nothing close enough: hand the original keyword back
	return $keyword;
}
121
/// main
///
/// Dispatches on the first command-line argument:
///   --builddict	reads "keyword frequency" lines from stdin, writes the SQL dump to stdout
///   --query WORD	asks Sphinx for a suggestion for WORD and prints it
/// Missing or unrecognised arguments print the usage text and stop.
$usage = "usage:\n"
	. "php suggest.php --builddict\treads stopwords from stdin, prints SQL dump of the dictionary to stdout\n"
	. "php suggest.php --query WORD\tqueries Sphinx, prints suggestion\n";

if ( $_SERVER["argc"]<2 )
	die ( $usage );

$mode = $_SERVER["argv"][1];

if ( $mode==="--builddict" )
{
	$in = fopen ( "php://stdin", "r" );
	$out = fopen ( "php://stdout", "w" ); // stdout is write-only, "w+" is not meaningful for it
	if ( !$in || !$out )
		die ( "failed to open stdin/stdout\n" );

	BuildDictionarySQL ( $out, $in );

	fclose ( $in );
	fclose ( $out );
}
else if ( $mode==="--query" )
{
	// WORD is mandatory; without this check argv[2] would be read while undefined
	if ( $_SERVER["argc"]<3 )
		die ( $usage );

	// no MySQL connection is opened here: the lookup goes through the Sphinx
	// API only, and nothing in this script reads from a MySQL link
	$keyword = $_SERVER["argv"][2];
	printf ( "keyword: %s\nsuggestion: %s\n", $keyword, MakeSuggestion($keyword) );
}
else
{
	// unrecognised option: show usage instead of exiting silently
	die ( $usage );
}
145