1<?php
2/**
3 * @package     Joomla.Administrator
4 * @subpackage  com_finder
5 *
6 * @copyright   Copyright (C) 2005 - 2020 Open Source Matters, Inc. All rights reserved.
7 * @license     GNU General Public License version 2 or later; see LICENSE.txt
8 */
9
10defined('_JEXEC') or die;
11
12use Joomla\String\StringHelper;
13
14JLoader::register('FinderIndexerHelper', __DIR__ . '/helper.php');
15JLoader::register('FinderIndexerParser', __DIR__ . '/parser.php');
16JLoader::register('FinderIndexerStemmer', __DIR__ . '/stemmer.php');
17JLoader::register('FinderIndexerTaxonomy', __DIR__ . '/taxonomy.php');
18JLoader::register('FinderIndexerToken', __DIR__ . '/token.php');
19
20jimport('joomla.filesystem.file');
21
22/**
23 * Main indexer class for the Finder indexer package.
24 *
25 * The indexer class provides the core functionality of the Finder
26 * search engine. It is responsible for adding and updating the
27 * content links table; extracting and scoring tokens; and maintaining
28 * all referential information for the content.
29 *
30 * Note: All exceptions thrown from within this class should be caught
31 * by the controller.
32 *
33 * @since  2.5
34 */
abstract class FinderIndexer
{
	/**
	 * The title context identifier.
	 *
	 * @var    integer
	 * @since  2.5
	 */
	const TITLE_CONTEXT = 1;

	/**
	 * The text context identifier.
	 *
	 * @var    integer
	 * @since  2.5
	 */
	const TEXT_CONTEXT = 2;

	/**
	 * The meta context identifier.
	 *
	 * @var    integer
	 * @since  2.5
	 */
	const META_CONTEXT = 3;

	/**
	 * The path context identifier.
	 *
	 * @var    integer
	 * @since  2.5
	 */
	const PATH_CONTEXT = 4;

	/**
	 * The misc context identifier.
	 *
	 * @var    integer
	 * @since  2.5
	 */
	const MISC_CONTEXT = 5;

	/**
	 * The indexer state object. Populated lazily by getState() and replaced by setState().
	 *
	 * @var    JObject
	 * @since  2.5
	 */
	public static $state;

	/**
	 * The indexer profiler object. Only assigned by getState() when the application runs in debug mode.
	 *
	 * @var    JProfiler
	 * @since  2.5
	 */
	public static $profiler;

	/**
	 * Database driver cache.
	 *
	 * @var    JDatabaseDriver
	 * @since  3.8.0
	 */
	protected $db;

	/**
	 * Reusable Query Template. To be used with clone.
	 *
	 * @var    JDatabaseQuery
	 * @since  3.8.0
	 */
	protected $addTokensToDbQueryTemplate;

	/**
	 * FinderIndexer constructor.
	 *
	 * Caches the database driver and prepares the INSERT template used by addTokensToDb().
	 *
	 * @since  3.8.0
	 */
	public function __construct()
	{
		$this->db = JFactory::getDbo();

		$db = $this->db;

		/**
		 * Set up query template for addTokensToDb, we will be cloning this template when needed.
		 * This is about twice as fast as calling the clear function or setting up a new object.
		 */
		$this->addTokensToDbQueryTemplate = $db->getQuery(true)->insert($db->quoteName('#__finder_tokens'))
			->columns(
				array(
					$db->quoteName('term'),
					$db->quoteName('stem'),
					$db->quoteName('common'),
					$db->quoteName('phrase'),
					$db->quoteName('weight'),
					$db->quoteName('context'),
					$db->quoteName('language')
				)
			);
	}

	/**
	 * Returns a FinderIndexer driver object matching the current database server type.
	 *
	 * The driver is looked up as driver/<serverType>.php next to this file and is expected
	 * to declare the class FinderIndexerDriver<ServerType>.
	 *
	 * @return  FinderIndexer instance based on the database driver
	 *
	 * @since   3.0
	 * @throws  RuntimeException if driver class for indexer not present.
	 */
	public static function getInstance()
	{
		// The indexer driver is selected from the server type reported by the database driver.
		$serverType = JFactory::getDbo()->getServerType();

		// For `mssql` server types, convert the type to `sqlsrv`
		if ($serverType === 'mssql')
		{
			$serverType = 'sqlsrv';
		}

		$path  = __DIR__ . '/driver/' . $serverType . '.php';
		$class = 'FinderIndexerDriver' . ucfirst($serverType);

		// Without a driver file for this server type there is nothing to instantiate.
		if (!file_exists($path))
		{
			throw new RuntimeException(JText::sprintf('COM_FINDER_INDEXER_INVALID_DRIVER', $serverType));
		}

		// Make the driver class loadable, then instantiate it.
		JLoader::register($class, $path);

		return new $class;
	}

	/**
	 * Method to get the indexer state.
	 *
	 * The state is taken from the static cache first, then from the session key
	 * `_finder.state`, and is otherwise built from the com_finder component parameters.
	 * Whenever the state is not served from the static cache this method also sets up
	 * the profiler (debug mode only) and the stemmer (when stemming is enabled).
	 *
	 * @return  object  The indexer state object.
	 *
	 * @since   2.5
	 */
	public static function getState()
	{
		// First, try to load from the internal state.
		if (static::$state)
		{
			return static::$state;
		}

		// If we couldn't load from the internal state, try the session.
		$data = JFactory::getSession()->get('_finder.state', null);

		// If the state is empty, load the values for the first time.
		if (empty($data))
		{
			$data = new JObject;

			// Load the default configuration options.
			$data->options = JComponentHelper::getParams('com_finder');

			// Setup the weight lookup information, keyed by context identifier.
			$data->weights = array(
				self::TITLE_CONTEXT => round($data->options->get('title_multiplier', 1.7), 2),
				self::TEXT_CONTEXT  => round($data->options->get('text_multiplier', 0.7), 2),
				self::META_CONTEXT  => round($data->options->get('meta_multiplier', 1.2), 2),
				self::PATH_CONTEXT  => round($data->options->get('path_multiplier', 2.0), 2),
				self::MISC_CONTEXT  => round($data->options->get('misc_multiplier', 0.3), 2)
			);

			// Set the current time as the start time.
			$data->startTime = JFactory::getDate()->toSql();

			// Set the remaining default values.
			$data->batchSize   = (int) $data->options->get('batch_size', 50);
			$data->batchOffset = 0;
			$data->totalItems  = 0;
			$data->pluginState = array();
		}

		// Setup the profiler if debugging is enabled.
		if (JFactory::getApplication()->get('debug'))
		{
			static::$profiler = JProfiler::getInstance('FinderIndexer');
		}

		// Setup the stemmer.
		if ($data->options->get('stem', 1) && $data->options->get('stemmer', 'porter_en'))
		{
			FinderIndexerHelper::$stemmer = FinderIndexerStemmer::getInstance($data->options->get('stemmer', 'porter_en'));
		}

		// Set the state.
		static::$state = $data;

		return static::$state;
	}

	/**
	 * Method to set the indexer state.
	 *
	 * The state is stored both in the static cache and in the session key `_finder.state`.
	 *
	 * @param   object  $data  A new indexer state object. Must be a non-empty JObject.
	 *
	 * @return  boolean  True on success, false on failure.
	 *
	 * @since   2.5
	 */
	public static function setState($data)
	{
		// Only a non-empty JObject is accepted as state.
		if (empty($data) || !($data instanceof JObject))
		{
			return false;
		}

		// Set the new internal state.
		static::$state = $data;

		// Set the new session state.
		JFactory::getSession()->set('_finder.state', $data);

		return true;
	}

	/**
	 * Method to reset the indexer state.
	 *
	 * Clears both the static cache and the session key `_finder.state`.
	 *
	 * @return  void
	 *
	 * @since   2.5
	 */
	public static function resetState()
	{
		// Reset the internal state to null (late static binding, like getState() and setState()).
		static::$state = null;

		// Reset the session state to null.
		JFactory::getSession()->set('_finder.state', null);
	}

	/**
	 * Method to index a content item.
	 *
	 * @param   FinderIndexerResult  $item    The content item to index.
	 * @param   string               $format  The format of the content. [optional]
	 *
	 * @return  integer  The ID of the record in the links table.
	 *
	 * @since   2.5
	 * @throws  Exception on database error.
	 */
	abstract public function index($item, $format = 'html');

	/**
	 * Method to remove a link from the index.
	 *
	 * Decrements the link counters of all terms mapped to the link, deletes the mapping
	 * rows, deletes terms whose counter dropped to zero or below, deletes the link row
	 * and finally cleans up the taxonomy maps and orphaned taxonomy nodes.
	 *
	 * @param   integer  $linkId  The id of the link.
	 *
	 * @return  boolean  True on success.
	 *
	 * @since   2.5
	 * @throws  Exception on database error.
	 */
	public function remove($linkId)
	{
		$db    = $this->db;
		$query = $db->getQuery(true);
		$id    = (int) $linkId;

		// The mapping tables carry the hexadecimal suffixes 0 through f.
		for ($i = 0; $i <= 15; $i++)
		{
			$mapTable = '#__finder_links_terms' . dechex($i);

			// Update the link counts for the terms.
			$query->clear()
				->update($db->quoteName('#__finder_terms', 't'))
				->join('INNER', $db->quoteName($mapTable, 'm') .
					' ON ' . $db->quoteName('m.term_id') . ' = ' . $db->quoteName('t.term_id')
				)
				->set($db->quoteName('links') . ' = ' . $db->quoteName('links') . ' - 1')
				->where($db->quoteName('m.link_id') . ' = ' . $id);
			$db->setQuery($query)->execute();

			// Remove all records from the mapping tables.
			$query->clear()
				->delete($db->quoteName($mapTable))
				->where($db->quoteName('link_id') . ' = ' . $id);
			$db->setQuery($query)->execute();
		}

		// Delete all orphaned terms.
		$query->clear()
			->delete($db->quoteName('#__finder_terms'))
			->where($db->quoteName('links') . ' <= 0');
		$db->setQuery($query)->execute();

		// Delete the link from the index.
		$query->clear()
			->delete($db->quoteName('#__finder_links'))
			->where($db->quoteName('link_id') . ' = ' . $id);
		$db->setQuery($query)->execute();

		// Remove the taxonomy maps.
		FinderIndexerTaxonomy::removeMaps($linkId);

		// Remove the orphaned taxonomy nodes.
		FinderIndexerTaxonomy::removeOrphanNodes();

		return true;
	}

	/**
	 * Method to optimize the index. We use this method to remove unused terms
	 * and any other optimizations that might be necessary.
	 *
	 * @return  boolean  True on success.
	 *
	 * @since   2.5
	 * @throws  Exception on database error.
	 */
	abstract public function optimize();

	/**
	 * Method to get a content item's signature.
	 *
	 * The signature is the MD5 hash of the serialized item together with the weights
	 * and stemmer settings from the indexer state.
	 *
	 * @param   object  $item  The content item to index.
	 *
	 * @return  string  The content item's signature.
	 *
	 * @since   2.5
	 */
	protected static function getSignature($item)
	{
		// Get the indexer state.
		$state = static::getState();

		// Get the relevant configuration variables.
		$config = array(
			$state->weights,
			$state->options->get('stem', 1),
			$state->options->get('stemmer', 'porter_en')
		);

		return md5(serialize(array($item, $config)));
	}

	/**
	 * Method to parse input, tokenize it, and then add it to the database.
	 *
	 * @param   mixed    $input    String or resource to use as input. A resource is read in 2048 byte steps and
	 *                             tokenised piecewise, always cutting at a space so that no term is split.
	 *                             A string is tokenised in a single pass.
	 * @param   integer  $context  The context of the input. See context constants.
	 * @param   string   $lang     The language of the input.
	 * @param   string   $format   The format of the input.
	 *
	 * @return  integer  The number of tokens extracted from the input.
	 *
	 * @since   2.5
	 */
	protected function tokenizeToDb($input, $context, $lang, $format)
	{
		$count = 0;

		if (empty($input))
		{
			return $count;
		}

		// Plain input is parsed, tokenised and stored in one go.
		if (!is_resource($input))
		{
			return $this->tokenizeToDbShort($input, $context, $lang, $format, $count);
		}

		// Holds the text that was read but not yet handed to the tokeniser.
		$buffer = '';

		// Batch the process out to avoid memory limits.
		while (!feof($input))
		{
			$chunk = fread($input, 2048);

			// Stop on a failed read; otherwise the loop could run forever if feof() stays false.
			if ($chunk === false)
			{
				break;
			}

			$buffer .= $chunk;

			// At the end of the stream the remainder is flushed after the loop.
			if (feof($input))
			{
				break;
			}

			// Find the last space character so a term is never cut in half.
			$ls = strrpos($buffer, ' ');

			/*
			 * No usable split point yet: keep reading instead of tokenising the buffer now.
			 * Tokenising here without emptying the buffer would index the same text again
			 * on the next iteration.
			 */
			if ($ls === false || $ls === 0)
			{
				continue;
			}

			// Everything up to the last space is complete and can be processed.
			$string = substr($buffer, 0, $ls);

			// Whatever follows the last space is carried over to the next iteration.
			$buffer = StringHelper::trim(substr($buffer, $ls));

			// Parse, tokenise and add tokens to the database.
			$count = $this->tokenizeToDbShort($string, $context, $lang, $format, $count);
		}

		// Parse whatever remains once reading has finished.
		if ($buffer !== '')
		{
			$count = $this->tokenizeToDbShort($buffer, $context, $lang, $format, $count);
		}

		return $count;
	}

	/**
	 * Method to parse input, tokenise it, then add the tokens to the database.
	 *
	 * When the cumulative token count exceeds the `memory_table_limit` option
	 * (default 30000), toggleTables(false) is called.
	 *
	 * @param   string   $input    String to parse, tokenise and add to database.
	 * @param   integer  $context  The context of the input. See context constants.
	 * @param   string   $lang     The language of the input.
	 * @param   string   $format   The format of the input.
	 * @param   integer  $count    The number of tokens processed so far.
	 *
	 * @return  integer  Cumulative number of tokens extracted from the input so far.
	 *
	 * @since   3.7.0
	 */
	private function tokenizeToDbShort($input, $context, $lang, $format, $count)
	{
		// Parse the input.
		$input = FinderIndexerHelper::parse($input, $format);

		// Check the input.
		if (empty($input))
		{
			return $count;
		}

		// Tokenize the input.
		$tokens = FinderIndexerHelper::tokenize($input, $lang);

		// Add the tokens to the database.
		$count += $this->addTokensToDb($tokens, $context);

		// Go through getState() so an uninitialised state cannot cause a null dereference.
		$limit = static::getState()->options->get('memory_table_limit', 30000);

		// Check if we're approaching the memory limit of the token table.
		if ($count > $limit)
		{
			$this->toggleTables(false);
		}

		return $count;
	}

	/**
	 * Method to add a set of tokens to the database.
	 *
	 * Tokens are inserted into `#__finder_tokens` in batches of at most 1000 rows per query.
	 * Empty input results in no query at all.
	 *
	 * @param   mixed  $tokens   An array or single FinderIndexerToken object.
	 * @param   mixed  $context  The context of the tokens. See context constants. [optional]
	 *
	 * @return  integer  The number of tokens inserted into the database.
	 *
	 * @since   2.5
	 * @throws  Exception on database error.
	 */
	protected function addTokensToDb($tokens, $context = '')
	{
		// Treat a single token as a one element list so both input shapes share one code path.
		if ($tokens instanceof FinderIndexerToken)
		{
			$tokens = array($tokens);
		}

		// Nothing to insert: an INSERT statement without any value set would be invalid SQL.
		if (empty($tokens) || !is_array($tokens))
		{
			return 0;
		}

		// Get the database object.
		$db = $this->db;

		// The context is the same for every row, so cast it once.
		$context = (int) $context;

		// Count the number of token values.
		$values = 0;

		// Break into chunks of no more than 1000 items
		foreach (array_chunk($tokens, 1000) as $chunkTokens)
		{
			// Cloning a new query template is twice as fast as calling the clear function
			$query = clone $this->addTokensToDbQueryTemplate;

			// Iterate through the tokens to create SQL value sets.
			foreach ($chunkTokens as $token)
			{
				$query->values(
					$db->quote($token->term) . ', '
					. $db->quote($token->stem) . ', '
					. (int) $token->common . ', '
					. (int) $token->phrase . ', '
					. $db->escape((float) $token->weight) . ', '
					. $context . ', '
					. $db->quote($token->language)
				);
				++$values;
			}

			$db->setQuery($query)->execute();
		}

		return $values;
	}

	/**
	 * Method to switch the token tables from Memory tables to Disk tables
	 * when they are close to running out of memory.
	 * Since this is not supported/implemented in all DB-drivers, the default is a stub method, which simply returns true.
	 *
	 * @param   boolean  $memory  Flag to control how they should be toggled.
	 *
	 * @return  boolean  True on success.
	 *
	 * @since   2.5
	 * @throws  Exception on database error.
	 */
	protected function toggleTables($memory)
	{
		return true;
	}
}
587