1<?php
2/**
3 * @package     Joomla.Administrator
4 * @subpackage  com_finder
5 *
6 * @copyright   Copyright (C) 2005 - 2020 Open Source Matters, Inc. All rights reserved.
7 * @license     GNU General Public License version 2 or later; see LICENSE.txt
8 */
9
10defined('_JEXEC') or die;
11
12use Joomla\Registry\Registry;
13use Joomla\String\StringHelper;
14
15JLoader::register('FinderIndexerParser', __DIR__ . '/parser.php');
16JLoader::register('FinderIndexerStemmer', __DIR__ . '/stemmer.php');
17JLoader::register('FinderIndexerToken', __DIR__ . '/token.php');
18
19/**
20 * Helper class for the Finder indexer package.
21 *
22 * @since  2.5
23 */
class FinderIndexerHelper
{
	/**
	 * The token stemmer object. The stemmer is set by whatever class
	 * wishes to use it but it must be an instance of FinderIndexerStemmer.
	 *
	 * @var		FinderIndexerStemmer
	 * @since	2.5
	 */
	public static $stemmer;

	/**
	 * A state flag telling whether {@link FinderIndexerHelper::$stemmer} was an instance of
	 * FinderIndexerStemmer the last time stem() ran. It is refreshed on every stem() call
	 * because the stemmer property is public and may be replaced at any time.
	 *
	 * @var		boolean
	 * @since	3.7.0
	 */
	protected static $stemmerOK;

	/**
	 * Method to parse input into plain text.
	 *
	 * @param   string  $input   The raw input.
	 * @param   string  $format  The format of the input. [optional]
	 *
	 * @return  string  The parsed input.
	 *
	 * @since   2.5
	 * @throws  Exception on invalid parser.
	 */
	public static function parse($input, $format = 'html')
	{
		// Get a parser for the specified format and parse the input.
		return FinderIndexerParser::getInstance($format)->parse($input);
	}

	/**
	 * Method to tokenize a text string.
	 *
	 * Results for inputs shorter than 128 characters are memoised per request, keyed on
	 * the input, the language and the phrase flag.
	 *
	 * @param   string   $input   The input to tokenize.
	 * @param   string   $lang    The language of the input.
	 * @param   boolean  $phrase  Flag to indicate whether input could be a phrase. [optional]
	 *
	 * @return  array|FinderIndexerToken  An array of FinderIndexerToken objects or a single FinderIndexerToken object.
	 *
	 * @since   2.5
	 */
	public static function tokenize($input, $lang, $phrase = false)
	{
		static $cache;
		$store = StringHelper::strlen($input) < 128 ? md5($input . '::' . $lang . '::' . $phrase) : null;

		// Check if the string has been tokenized already.
		if ($store && isset($cache[$store]))
		{
			return $cache[$store];
		}

		$tokens = array();
		$quotes = html_entity_decode('&#8216;&#8217;&#39;', ENT_QUOTES, 'UTF-8');

		// Get the simple language key.
		$lang = static::getPrimaryLanguage($lang);

		/*
		 * Parsing the string input into terms is a multi-step process.
		 *
		 * Regexes:
		 *  1. Remove everything except letters, numbers, quotes, apostrophe, plus, dash, period, and comma.
		 *  2. Remove plus, dash, period, and comma characters located before letter characters.
		 *  3. Remove plus, dash, period, and comma characters located after other characters.
		 *  4. Remove plus, period, and comma characters enclosed in alphabetical characters. Ungreedy.
		 *  5. Remove orphaned apostrophe, plus, dash, period, and comma characters.
		 *  6. Remove orphaned quote characters.
		 *  7. Replace the assorted single quotation marks with the ASCII standard single quotation.
		 *  8. Remove multiple space characters and replaces with a single space.
		 *
		 * Note: inside the character classes "+-." is a range (U+002B to U+002E), which
		 * covers exactly plus, comma, dash and period.
		 */
		$input = StringHelper::strtolower($input);
		$input = preg_replace('#[^\pL\pM\pN\p{Pi}\p{Pf}\'+-.,]+#mui', ' ', $input);
		$input = preg_replace('#(^|\s)[+-.,]+([\pL\pM]+)#mui', ' $1', $input);
		$input = preg_replace('#([\pL\pM\pN]+)[+-.,]+(\s|$)#mui', '$1 ', $input);
		$input = preg_replace('#([\pL\pM]+)[+.,]+([\pL\pM]+)#muiU', '$1 $2', $input);
		$input = preg_replace('#(^|\s)[\'+-.,]+(\s|$)#mui', ' ', $input);
		$input = preg_replace('#(^|\s)[\p{Pi}\p{Pf}]+(\s|$)#mui', ' ', $input);
		$input = preg_replace('#[' . $quotes . ']+#mui', '\'', $input);
		$input = preg_replace('#\s+#mui', ' ', $input);
		$input = trim($input);

		// Explode the normalized string to get the terms.
		$terms = explode(' ', $input);

		/*
		 * Chinese has to be handled specially because there are not necessarily any spaces
		 * between the "words". So, we have to test if the words contain Han characters
		 * and if so, explode them into single glyphs or "words".
		 */
		if ($lang === 'zh')
		{
			$terms = static::splitHanTerms($terms);
		}

		/*
		 * If we have to handle the input as a phrase, that means we don't
		 * tokenize the individual terms and we do not create the two and three
		 * term combinations. The phrase must contain more than one word!
		 */
		if ($phrase === true && count($terms) > 1)
		{
			// Create tokens from the phrase.
			$tokens[] = new FinderIndexerToken($terms, $lang);
		}
		else
		{
			// Create tokens from the terms.
			foreach ($terms as $term)
			{
				$tokens[] = new FinderIndexerToken($term, $lang);
			}

			// Han glyphs are joined without a separator, everything else with a space.
			$spacer = $lang === 'zh' ? '' : ' ';

			// Create two and three word phrase tokens from the individual words.
			// $n is fixed before the loop so the derived tokens appended below are not revisited.
			for ($i = 0, $n = count($tokens); $i < $n; $i++)
			{
				// Setup the phrase positions.
				$i2 = $i + 1;
				$i3 = $i + 2;

				// Create the two word phrase.
				if ($i2 < $n && isset($tokens[$i2]))
				{
					$token          = new FinderIndexerToken(
						array(
							$tokens[$i]->term,
							$tokens[$i2]->term
						), $lang, $spacer
					);
					$token->derived = true;

					// Add the token to the stack.
					$tokens[] = $token;
				}

				// Create the three word phrase.
				if ($i3 < $n && isset($tokens[$i3]))
				{
					$token          = new FinderIndexerToken(
						array(
							$tokens[$i]->term,
							$tokens[$i2]->term,
							$tokens[$i3]->term
						), $lang, $spacer
					);
					$token->derived = true;

					// Add the token to the stack.
					$tokens[] = $token;
				}
			}
		}

		// A single token is returned bare rather than wrapped in an array.
		$result = count($tokens) > 1 ? $tokens : array_shift($tokens);

		if ($store)
		{
			$cache[$store] = $result;
		}

		return $result;
	}

	/**
	 * Method to split terms containing Han characters into single glyphs.
	 *
	 * Every Han character found in a term becomes a term of its own. Whatever is left of
	 * the term once its Han characters are removed is kept (if not empty). The returned
	 * list holds the kept terms in their original order followed by all glyphs in the
	 * order they were found.
	 *
	 * @param   array  $terms  The terms to inspect.
	 *
	 * @return  array  The re-indexed list of terms.
	 *
	 * @since   2.5
	 */
	protected static function splitHanTerms(array $terms)
	{
		$remainders = array();
		$glyphs     = array();

		foreach ($terms as $term)
		{
			$charMatches = array();

			// No Han characters (or a failed match): keep the term untouched.
			if (!preg_match_all('#[\p{Han}]#mui', $term, $charMatches))
			{
				$remainders[] = $term;

				continue;
			}

			// Each occurrence becomes its own term, repeated glyphs included.
			foreach ($charMatches[0] as $glyph)
			{
				$glyphs[] = $glyph;
			}

			// Keep the non-Han rest of the term. Compare against the empty string
			// explicitly so that a remainder such as "0" is not discarded.
			$rest = preg_replace('#[\p{Han}]#mui', '', $term);

			if ($rest !== null && $rest !== '')
			{
				$remainders[] = $rest;
			}
		}

		return array_merge($remainders, $glyphs);
	}

	/**
	 * Method to get the base word of a token. This method uses the public
	 * {@link FinderIndexerHelper::$stemmer} object if it is set. If no stemmer is set,
	 * the original token is returned (with apostrophes handled as described below).
	 *
	 * @param   string  $token  The token to stem.
	 * @param   string  $lang   The language of the token.
	 *
	 * @return  string  The root token.
	 *
	 * @since   2.5
	 */
	public static function stem($token, $lang)
	{
		// Trim apostrophes at either end of the token.
		$token = trim($token, '\'');

		// Trim everything after any apostrophe in the token.
		$parts = explode('\'', $token);
		$token = $parts[0];

		// Re-validate the stemmer on every call: the property is public, so a cached
		// "OK" flag could outlive the object it was computed for.
		static::$stemmerOK = static::$stemmer instanceof FinderIndexerStemmer;

		if (static::$stemmerOK)
		{
			return static::$stemmer->stem($token, $lang);
		}

		return $token;
	}

	/**
	 * Method to add a content type to the database.
	 *
	 * The known types are loaded once and cached for the request. A newly inserted type
	 * is added to that cache so that asking for the same title again returns its id
	 * instead of inserting a second row.
	 *
	 * @param   string  $title  The type of content. For example: PDF
	 * @param   string  $mime   The mime type of the content. For example: PDF [optional]
	 *
	 * @return  integer  The id of the content type.
	 *
	 * @since   2.5
	 * @throws  Exception on database error.
	 */
	public static function addContentType($title, $mime = null)
	{
		static $types;

		$db    = JFactory::getDbo();
		$query = $db->getQuery(true);

		// Check if the types are loaded.
		if (empty($types))
		{
			// Build the query to get the types.
			$query->select('*')
				->from($db->quoteName('#__finder_types'));

			// Get the types, keyed by title.
			$db->setQuery($query);
			$types = $db->loadObjectList('title');
		}

		// Check if the type already exists.
		if (isset($types[$title]))
		{
			return (int) $types[$title]->id;
		}

		// Add the type.
		$query->clear()
			->insert($db->quoteName('#__finder_types'))
			->columns(array($db->quoteName('title'), $db->quoteName('mime')))
			->values($db->quote($title) . ', ' . $db->quote($mime));
		$db->setQuery($query);
		$db->execute();

		$id = (int) $db->insertid();

		// Remember the new type so that a repeated call does not insert it again.
		if (!is_array($types))
		{
			$types = array();
		}

		$types[$title] = (object) array('id' => $id, 'title' => $title, 'mime' => $mime);

		// Return the new id.
		return $id;
	}

	/**
	 * Method to check if a token is common in a language.
	 *
	 * @param   string  $token  The token to test.
	 * @param   string  $lang   The language to reference. Use '*' for the site default language.
	 *
	 * @return  boolean  True if common, false otherwise.
	 *
	 * @since   2.5
	 */
	public static function isCommon($token, $lang)
	{
		static $data;
		static $default;

		$langCode = $lang;

		// If language requested is wildcard, use the default language.
		if ($lang === '*')
		{
			// Resolve the default only once, but apply it on every wildcard call.
			if ($default === null)
			{
				$siteLanguage = self::getDefaultLanguage();
				$primary      = strstr($siteLanguage, '-', true);

				// strstr() gives false when there is no hyphen; use the whole tag then.
				$default = $primary !== false ? $primary : $siteLanguage;
			}

			$langCode = $default;
		}

		// Load the common tokens for the language if necessary.
		if (!isset($data[$langCode]))
		{
			$data[$langCode] = self::getCommonWords($langCode);
		}

		// Check if the token is in the common array.
		return in_array($token, $data[$langCode], true);
	}

	/**
	 * Method to get an array of common terms for a language.
	 *
	 * @param   string  $lang  The language to use.
	 *
	 * @return  array  Array of common terms.
	 *
	 * @since   2.5
	 * @throws  Exception on database error.
	 */
	public static function getCommonWords($lang)
	{
		$db = JFactory::getDbo();

		// Create the query to load all the common terms for the language.
		$query = $db->getQuery(true)
			->select($db->quoteName('term'))
			->from($db->quoteName('#__finder_terms_common'))
			->where($db->quoteName('language') . ' = ' . $db->quote($lang));

		// Load all of the common terms for the language.
		$db->setQuery($query);

		return $db->loadColumn();
	}

	/**
	 * Method to get the default language for the site.
	 *
	 * @return  string  The default language string.
	 *
	 * @since   2.5
	 */
	public static function getDefaultLanguage()
	{
		static $lang;

		// We need to go to com_languages to get the site default language, it's the best we can guess.
		if (empty($lang))
		{
			$lang = JComponentHelper::getParams('com_languages')->get('site', 'en-GB');
		}

		return $lang;
	}

	/**
	 * Method to parse a language/locale key and return a simple language string.
	 *
	 * @param   string  $lang  The language/locale key. For example: en-GB
	 *
	 * @return  string  The simple language string. For example: en
	 *
	 * @since   2.5
	 */
	public static function getPrimaryLanguage($lang)
	{
		static $data;

		// Only parse the identifier if necessary.
		if (!isset($data[$lang]))
		{
			if (is_callable(array('Locale', 'getPrimaryLanguage')))
			{
				// Get the language key using the Locale package.
				$data[$lang] = Locale::getPrimaryLanguage($lang);
			}
			else
			{
				// Take everything before the first hyphen. A key without a hyphen
				// (for example "en") is returned unchanged instead of being emptied.
				$primary     = strstr($lang, '-', true);
				$data[$lang] = $primary !== false ? $primary : $lang;
			}
		}

		return $data[$lang];
	}

	/**
	 * Method to get the path (SEF route) for a content item.
	 *
	 * @param   string  $url  The non-SEF route to the content item.
	 *
	 * @return  string  The path for the content item.
	 *
	 * @since       2.5
	 * @deprecated  4.0
	 */
	public static function getContentPath($url)
	{
		static $router;

		// Only get the router once.
		if (!($router instanceof JRouter))
		{
			// Get and configure the site router.
			$config = JFactory::getConfig();
			$router = JRouter::getInstance('site');
			$router->setMode($config->get('sef', 1));
		}

		// Build the relative route.
		$uri   = $router->build($url);
		$route = $uri->toString(array('path', 'query', 'fragment'));

		// Strip the base path prefix so the route is relative.
		$route = str_replace(JUri::base(true) . '/', '', $route);

		return $route;
	}

	/**
	 * Method to get extra data for a content before being indexed. This is how
	 * we add Comments, Tags, Labels, etc. that should be available to Finder.
	 *
	 * @param   FinderIndexerResult  $item  The item to index as a FinderIndexerResult object.
	 *
	 * @return  boolean  True on success. Failure is reported by throwing an exception.
	 *
	 * @since   2.5
	 * @throws  Exception if any plugin result compares loosely equal to false.
	 */
	public static function getContentExtras(FinderIndexerResult $item)
	{
		// Get the event dispatcher.
		$dispatcher = JEventDispatcher::getInstance();

		// Load the finder plugin group.
		JPluginHelper::importPlugin('finder');

		// Trigger the event.
		$results = $dispatcher->trigger('onPrepareFinderContent', array(&$item));

		// Check the returned results. This is for plugins that don't throw
		// exceptions when they encounter serious errors.
		if (in_array(false, $results))
		{
			throw new Exception($dispatcher->getError(), 500);
		}

		return true;
	}

	/**
	 * Method to process content text using the onContentPrepare event trigger.
	 *
	 * @param   string               $text    The content to process.
	 * @param   Registry             $params  The parameters object. [optional]
	 * @param   FinderIndexerResult  $item    The item which get prepared. [optional]
	 *
	 * @return  string  The processed content.
	 *
	 * @since   2.5
	 */
	public static function prepareContent($text, $params = null, FinderIndexerResult $item = null)
	{
		static $loaded;

		// Get the dispatcher.
		$dispatcher = JEventDispatcher::getInstance();

		// Load the content plugins if necessary.
		if (empty($loaded))
		{
			JPluginHelper::importPlugin('content');
			$loaded = true;
		}

		// Instantiate the parameter object if necessary.
		if (!($params instanceof Registry))
		{
			$params = new Registry($params);
		}

		// Create a mock content object.
		$content       = JTable::getInstance('Content');
		$content->text = $text;

		if ($item)
		{
			// Copy the item's properties first, then its elements on top.
			$content->bind((array) $item);
			$content->bind($item->getElements());

			if (!empty($item->context))
			{
				$content->context = $item->context;
			}
		}

		// Fire the onContentPrepare event.
		$dispatcher->trigger('onContentPrepare', array('com_finder.indexer', &$content, &$params, 0));

		return $content->text;
	}
}
539