1<?php
2 /**
3	* Copyright (c) 2005 Richard Heyes (http://www.phpguru.org/)
4	*
5	* All rights reserved.
6	*
7	* This script is free software.
8	*/
9
10	/**
11	* PHP5 Implementation of the Porter Stemmer algorithm. Certain elements
12	* were borrowed from the (broken) implementation by Jon Abernathy.
13	*
14	* Usage:
15	*
16	*  $stem = PorterStemmer::Stem($word);
17	*
18	* How easy is that?
19	*/
20
21	// Porter Stemmer
/**
 * Porter stemmer: reduces an English word to its stem by applying the
 * suffix-stripping steps 1a/1b, 1c, 2, 3, 4 and 5 in sequence.
 *
 * All methods are static; the only public entry point is Stem(). Input is
 * processed byte-wise and matched against lowercase ASCII suffixes, so the
 * caller is expected to pass a lowercase word.
 */
class PorterStemmer
{
	/**
	 * Regex fragment matching one consonant: any letter other than
	 * a, e, i, o, u, plus "y" when it follows a vowel or starts the string.
	 *
	 * @var string
	 */
	private static $regex_consonant = '(?:[bcdfghjklmnpqrstvwxz]|(?<=[aeiou])y|^y)';


	/**
	 * Regex fragment matching one vowel: a, e, i, o, u, plus "y" when it is
	 * not preceded by a vowel.
	 *
	 * Note: a "y" at the very start of a string satisfies both this pattern
	 * and the consonant pattern. m() depends on that: after it strips the
	 * leading consonants, a "y" that used to follow a consonant sits at the
	 * start of the remainder and must still be counted as a vowel.
	 *
	 * @var string
	 */
	private static $regex_vowel = '(?:[aeiou]|(?<![aeiou])y)';


	/**
	 * Stems a word.
	 *
	 * Words of two bytes or fewer are returned unchanged. Results are
	 * memoised in a function-static array for the lifetime of the request.
	 *
	 * @param  string $word Word to stem
	 * @return string       Stemmed word
	 */
	public static function Stem($word)
	{
		static $cache = [];

		if (strlen($word) <= 2) {
			return $word;
		}

		if (isset($cache[$word])) {
			return $cache[$word];
		}

		$stem = self::step1ab($word);
		$stem = self::step1c($stem);
		$stem = self::step2($stem);
		$stem = self::step3($stem);
		$stem = self::step4($stem);
		$stem = self::step5($stem);

		$cache[$word] = $stem;

		return $stem;
	}


	/**
	 * Step 1a (plural endings) and step 1b ("eed", "ed", "ing").
	 *
	 * @param  string $word Word to stem
	 * @return string       Word after step 1a/1b
	 */
	private static function step1ab($word)
	{
		// Part a: first matching suffix wins; replace() returns true on a
		// suffix match, which short-circuits the rest of the chain.
		if (substr($word, -1) === 's') {
			self::replace($word, 'sses', 'ss')
				|| self::replace($word, 'ies', 'i')
				|| self::replace($word, 'ss', 'ss')
				|| self::replace($word, 's', '');
		}

		// Part b: when the word ends in "eed" that rule is the only one
		// considered (replace() returns true even if m was too small to
		// rewrite), so "ed"/"ing" are tried only when "eed" did not match.
		if (substr($word, -2, 1) !== 'e' || ! self::replace($word, 'eed', 'ee', 0)) {
			$v = self::$regex_vowel;

			// "ing" / "ed" are removed only if the remaining stem has a vowel.
			$strippedIng = preg_match("#$v+#", substr($word, 0, -3)) && self::replace($word, 'ing', '');
			$stripped    = $strippedIng
				|| (preg_match("#$v+#", substr($word, 0, -2)) && self::replace($word, 'ed', ''));

			if ($stripped) {
				// Tidy up the stem: restore a trailing "e" or undouble a consonant.
				$restoredE = self::replace($word, 'at', 'ate')
					|| self::replace($word, 'bl', 'ble')
					|| self::replace($word, 'iz', 'ize');

				if (! $restoredE) {
					$tail = substr($word, -2);

					if (self::doubleConsonant($word)
							&& $tail !== 'll'
							&& $tail !== 'ss'
							&& $tail !== 'zz') {
						// Double consonant ending (other than ll/ss/zz): drop one letter.
						$word = substr($word, 0, -1);
					} elseif (self::m($word) === 1 && self::cvc($word)) {
						// Short stem ending consonant-vowel-consonant: append "e".
						$word .= 'e';
					}
				}
			}
		}

		return $word;
	}


	/**
	 * Step 1c: turns a terminal "y" into "i" when the rest of the word
	 * contains a vowel.
	 *
	 * @param  string $word Word to stem
	 * @return string       Word after step 1c
	 */
	private static function step1c($word)
	{
		$v = self::$regex_vowel;

		if (substr($word, -1) === 'y' && preg_match("#$v+#", substr($word, 0, -1))) {
			self::replace($word, 'y', 'i');
		}

		return $word;
	}


	/**
	 * Step 2: maps double suffixes to single ones (e.g. "ational" -> "ate")
	 * when the preceding stem has m > 0. The switch on the penultimate
	 * letter narrows down which suffixes can possibly match.
	 *
	 * @param  string $word Word to stem
	 * @return string       Word after step 2
	 */
	private static function step2($word)
	{
		switch (substr($word, -2, 1)) {
			case 'a':
				self::replace($word, 'ational', 'ate', 0)
					|| self::replace($word, 'tional', 'tion', 0);
				break;

			case 'c':
				self::replace($word, 'enci', 'ence', 0)
					|| self::replace($word, 'anci', 'ance', 0);
				break;

			case 'e':
				self::replace($word, 'izer', 'ize', 0);
				break;

			case 'g':
				self::replace($word, 'logi', 'log', 0);
				break;

			case 'l':
				self::replace($word, 'entli', 'ent', 0)
					|| self::replace($word, 'ousli', 'ous', 0)
					|| self::replace($word, 'alli', 'al', 0)
					|| self::replace($word, 'bli', 'ble', 0)
					|| self::replace($word, 'eli', 'e', 0);
				break;

			case 'o':
				self::replace($word, 'ization', 'ize', 0)
					|| self::replace($word, 'ation', 'ate', 0)
					|| self::replace($word, 'ator', 'ate', 0);
				break;

			case 's':
				self::replace($word, 'iveness', 'ive', 0)
					|| self::replace($word, 'fulness', 'ful', 0)
					|| self::replace($word, 'ousness', 'ous', 0)
					|| self::replace($word, 'alism', 'al', 0);
				break;

			case 't':
				self::replace($word, 'biliti', 'ble', 0)
					|| self::replace($word, 'aliti', 'al', 0)
					|| self::replace($word, 'iviti', 'ive', 0);
				break;
		}

		return $word;
	}


	/**
	 * Step 3: rewrites or removes suffixes such as "ical", "ness", "ful"
	 * when the preceding stem has m > 0.
	 *
	 * @param  string $word Word to stem
	 * @return string       Word after step 3
	 */
	private static function step3($word)
	{
		switch (substr($word, -2, 1)) {
			case 'a':
				self::replace($word, 'ical', 'ic', 0);
				break;

			case 's':
				self::replace($word, 'ness', '', 0);
				break;

			case 't':
				self::replace($word, 'icate', 'ic', 0)
					|| self::replace($word, 'iciti', 'ic', 0);
				break;

			case 'u':
				self::replace($word, 'ful', '', 0);
				break;

			case 'v':
				self::replace($word, 'ative', '', 0);
				break;

			case 'z':
				self::replace($word, 'alize', 'al', 0);
				break;
		}

		return $word;
	}


	/**
	 * Step 4: removes residual suffixes ("al", "ance", "ment", ...) when the
	 * preceding stem has m > 1.
	 *
	 * @param  string $word Word to stem
	 * @return string       Word after step 4
	 */
	private static function step4($word)
	{
		switch (substr($word, -2, 1)) {
			case 'a':
				self::replace($word, 'al', '', 1);
				break;

			case 'c':
				self::replace($word, 'ance', '', 1)
					|| self::replace($word, 'ence', '', 1);
				break;

			case 'e':
				self::replace($word, 'er', '', 1);
				break;

			case 'i':
				self::replace($word, 'ic', '', 1);
				break;

			case 'l':
				self::replace($word, 'able', '', 1)
					|| self::replace($word, 'ible', '', 1);
				break;

			case 'n':
				// Longest suffix first: a match ends the chain even when the
				// m condition prevented the actual removal.
				self::replace($word, 'ant', '', 1)
					|| self::replace($word, 'ement', '', 1)
					|| self::replace($word, 'ment', '', 1)
					|| self::replace($word, 'ent', '', 1);
				break;

			case 'o':
				// "ion" is only removable when preceded by "s" or "t".
				$tail = substr($word, -4);
				if ($tail === 'tion' || $tail === 'sion') {
					self::replace($word, 'ion', '', 1);
				} else {
					self::replace($word, 'ou', '', 1);
				}
				break;

			case 's':
				self::replace($word, 'ism', '', 1);
				break;

			case 't':
				self::replace($word, 'ate', '', 1)
					|| self::replace($word, 'iti', '', 1);
				break;

			case 'u':
				self::replace($word, 'ous', '', 1);
				break;

			case 'v':
				self::replace($word, 'ive', '', 1);
				break;

			case 'z':
				self::replace($word, 'ize', '', 1);
				break;
		}

		return $word;
	}


	/**
	 * Step 5: drops a final "e" and undoubles a final "ll" on long stems.
	 *
	 * @param  string $word Word to stem
	 * @return string       Word after step 5
	 */
	private static function step5($word)
	{
		// Part a: remove the trailing "e" when the stem has m > 1, or when
		// m == 1 and the stem does not end consonant-vowel-consonant.
		if (substr($word, -1) === 'e') {
			$stem    = substr($word, 0, -1);
			$measure = self::m($stem);

			if ($measure > 1 || ($measure === 1 && ! self::cvc($stem))) {
				self::replace($word, 'e', '');
			}
		}

		// Part b: "ll" -> "l" when m > 1.
		if (self::m($word) > 1 && self::doubleConsonant($word) && substr($word, -1) === 'l') {
			$word = substr($word, 0, -1);
		}

		return $word;
	}


	/**
	 * Replaces $check with $repl at the end of $str. When $m is given, the
	 * replacement is only performed if the part of $str preceding $check has
	 * an m() count strictly greater than $m.
	 *
	 * @param  string   $str   String to modify (by reference)
	 * @param  string   $check Ending to look for
	 * @param  string   $repl  Replacement for the ending
	 * @param  int|null $m     Optional m() count the stem must exceed
	 * @return bool            Whether $str ended with $check. True does not
	 *                         necessarily mean that it was replaced.
	 */
	private static function replace(&$str, $check, $repl, $m = null)
	{
		$len = 0 - strlen($check);

		if (substr($str, $len) !== $check) {
			return false;
		}

		$stem = substr($str, 0, $len);
		if ($m === null || self::m($stem) > $m) {
			$str = $stem . $repl;
		}

		return true;
	}


	/**
	 * Measures the number of vowel-consonant sequences in $str. If c is a
	 * consonant sequence and v a vowel sequence, and <..> indicates
	 * arbitrary presence,
	 *
	 * <c><v>       gives 0
	 * <c>vc<v>     gives 1
	 * <c>vcvc<v>   gives 2
	 * <c>vcvcvc<v> gives 3
	 *
	 * @param  string $str The string to return the m count for
	 * @return int         The m count
	 */
	private static function m($str)
	{
		$c = self::$regex_consonant;
		$v = self::$regex_vowel;

		// Strip the optional leading consonant run and trailing vowel run,
		// then count the remaining vowel-run + consonant-run pairs.
		$str = preg_replace("#^$c+#", '', $str);
		$str = preg_replace("#$v+$#", '', $str);

		preg_match_all("#($v+$c+)#", $str, $matches);

		return count($matches[1]);
	}


	/**
	 * Returns whether the string ends with two identical consonants.
	 *
	 * @param  string $str String to check
	 * @return bool        Result
	 */
	private static function doubleConsonant($str)
	{
		$c = self::$regex_consonant;

		// Square-bracket offsets: the curly-brace form ($s{0}) was removed
		// in PHP 8.0 and is a parse error there.
		return preg_match('#' . $c . '{2}$#', $str, $matches)
			&& $matches[0][0] === $matches[0][1];
	}


	/**
	 * Checks for an ending consonant-vowel-consonant sequence where the
	 * final consonant is not w, x or y.
	 *
	 * @param  string $str String to check
	 * @return bool        Result
	 */
	private static function cvc($str)
	{
		$c = self::$regex_consonant;
		$v = self::$regex_vowel;

		return preg_match("#($c$v$c)$#", $str, $matches)
			&& strlen($matches[1]) === 3
			&& ! in_array($matches[1][2], ['w', 'x', 'y'], true);
	}
}
413