1<?php
2
3/**
4 * An alternative abstract class for bridges utilizing XPath expressions
5 *
6 * This class is meant as an alternative base class for bridge implementations.
7 * It offers preliminary functionality for generating feeds based on XPath
8 * expressions.
9 * As a minimum, extending classes should define XPath expressions pointing
10 * to the feed items contents in the class constants below. In case there is
11 * more manual fine tuning required, it offers a bunch of methods which can
12 * be overridden, for example in order to specify formatting of field values
13 * or more flexible definition of dynamic XPath expressions.
14 *
15 * This class extends {@see BridgeAbstract}, which means it incorporates and
16 * extends all of its functionality.
17 **/
abstract class XPathAbstract extends BridgeAbstract {

	/**
	 * Source Web page URL (should provide either HTML or XML content)
	 * You can specify any website URL which serves data suited for display in RSS feeds
	 * (for example a news blog).
	 *
	 * Use {@see XPathAbstract::getSourceUrl()} to read this parameter
	 */
	const FEED_SOURCE_URL = '';

	/**
	 * XPath expression for extracting the feed title from the source page.
	 * If this is left blank or does not provide any data {@see BridgeAbstract::getName()}
	 * is used instead as the feed's title.
	 *
	 * Use {@see XPathAbstract::getExpressionTitle()} to read this parameter
	 */
	const XPATH_EXPRESSION_FEED_TITLE = './/title';

	/**
	 * XPath expression for extracting the feed favicon URL from the source page.
	 * If this is left blank or does not provide any data {@see BridgeAbstract::getIcon()}
	 * is used instead as the feed's favicon URL.
	 *
	 * Use {@see XPathAbstract::getExpressionIcon()} to read this parameter
	 */
	const XPATH_EXPRESSION_FEED_ICON = './/link[@rel="icon"]/@href';

	/**
	 * XPath expression for extracting the feed items from the source page
	 * Enter an XPath expression matching a list of dom nodes, each node containing one
	 * feed article item in total (usually a surrounding <div> or <span> tag). This will
	 * be the context nodes for all of the following expressions. This expression usually
	 * starts with a single forward slash.
	 *
	 * Use {@see XPathAbstract::getExpressionItem()} to read this parameter
	 */
	const XPATH_EXPRESSION_ITEM = '';

	/**
	 * XPath expression for extracting an item title from the item context
	 * This expression should match a node contained within each article item node
	 * containing the article headline. It should start with a dot followed by two
	 * forward slashes, referring to any descendant nodes of the article item node.
	 *
	 * Use {@see XPathAbstract::getExpressionItemTitle()} to read this parameter
	 */
	const XPATH_EXPRESSION_ITEM_TITLE = '';

	/**
	 * XPath expression for extracting an item's content from the item context
	 * This expression should match a node contained within each article item node
	 * containing the article content or description. It should start with a dot
	 * followed by two forward slashes, referring to any descendant nodes of the
	 * article item node.
	 *
	 * Use {@see XPathAbstract::getExpressionItemContent()} to read this parameter
	 */
	const XPATH_EXPRESSION_ITEM_CONTENT = '';

	/**
	 * XPath expression for extracting an item link from the item context
	 * This expression should match a node's attribute containing the article URL
	 * (usually the href attribute of an <a> tag). It should start with a dot
	 * followed by two forward slashes, referring to any descendant nodes of
	 * the article item node. Attributes can be selected by prepending an @ char
	 * before the attributes name.
	 *
	 * Use {@see XPathAbstract::getExpressionItemUri()} to read this parameter
	 */
	const XPATH_EXPRESSION_ITEM_URI = '';

	/**
	 * XPath expression for extracting an item author from the item context
	 * This expression should match a node contained within each article item
	 * node containing the article author's name. It should start with a dot
	 * followed by two forward slashes, referring to any descendant nodes of
	 * the article item node.
	 *
	 * Use {@see XPathAbstract::getExpressionItemAuthor()} to read this parameter
	 */
	const XPATH_EXPRESSION_ITEM_AUTHOR = '';

	/**
	 * XPath expression for extracting an item timestamp from the item context
	 * This expression should match a node or node's attribute containing the
	 * article timestamp or date (parsable by PHP's strtotime function). It
	 * should start with a dot followed by two forward slashes, referring to
	 * any descendant nodes of the article item node. Attributes can be
	 * selected by prepending an @ char before the attributes name.
	 *
	 * Use {@see XPathAbstract::getExpressionItemTimestamp()} to read this parameter
	 */
	const XPATH_EXPRESSION_ITEM_TIMESTAMP = '';

	/**
	 * XPath expression for extracting item enclosures (media content like
	 * images or movies) from the item context
	 * This expression should match a node's attribute containing an article
	 * image URL (usually the src attribute of an <img> tag or a style
	 * attribute). It should start with a dot followed by two forward slashes,
	 * referring to any descendant nodes of the article item node. Attributes
	 * can be selected by prepending an @ char before the attributes name.
	 *
	 * Use {@see XPathAbstract::getExpressionItemEnclosures()} to read this parameter
	 */
	const XPATH_EXPRESSION_ITEM_ENCLOSURES = '';

	/**
	 * XPath expression for extracting an item category from the item context
	 * This expression should match a node or node's attribute contained
	 * within each article item node containing the article category. This
	 * could be inside <div> or <span> tags or sometimes be hidden
	 * in a data attribute. It should start with a dot followed by two
	 * forward slashes, referring to any descendant nodes of the article
	 * item node. Attributes can be selected by prepending an @ char
	 * before the attributes name.
	 *
	 * Use {@see XPathAbstract::getExpressionItemCategories()} to read this parameter
	 */
	const XPATH_EXPRESSION_ITEM_CATEGORIES = '';

	/**
	 * Fix encoding
	 * Set this to true for fixing feed encoding by converting all extracted
	 * texts from UTF-8 to ISO-8859-1 (see {@see XPathAbstract::fixEncoding()}).
	 * Try this in case you see "broken" or "weird" characters in your feed
	 * where you'd normally expect umlauts or any other non-ascii characters.
	 *
	 * Use {@see XPathAbstract::getSettingFixEncoding()} to read this parameter
	 */
	const SETTING_FIX_ENCODING = false;

	/**
	 * Internal storage for resulting feed name, automatically detected
	 * @var string
	 */
	private $feedName;

	/**
	 * Internal storage for the feed's source URI, set from
	 * {@see XPathAbstract::getSourceUrl()} when data is collected
	 * @var string
	 */
	private $feedUri;

	/**
	 * Internal storage for resulting feed favicon, automatically detected
	 * @var string
	 */
	private $feedIcon;

	/**
	 * Returns the detected feed title, or the parent's name when none was detected.
	 * @return string
	 */
	public function getName(){
		return $this->feedName ?: parent::getName();
	}

	/**
	 * Returns the feed source URI, or the parent's URI when none has been set yet.
	 * @return string
	 */
	public function getURI() {
		return $this->feedUri ?: parent::getURI();
	}

	/**
	 * Returns the detected favicon URL, or the parent's icon when none was detected.
	 * @return string
	 */
	public function getIcon() {
		return $this->feedIcon ?: parent::getIcon();
	}

	/**
	 * Source Web page URL (should provide either HTML or XML content)
	 * @return string
	 */
	protected function getSourceUrl(){
		return static::FEED_SOURCE_URL;
	}

	/**
	 * XPath expression for extracting the feed title from the source page
	 * @return string
	 */
	protected function getExpressionTitle(){
		return static::XPATH_EXPRESSION_FEED_TITLE;
	}

	/**
	 * XPath expression for extracting the feed favicon from the source page
	 * @return string
	 */
	protected function getExpressionIcon(){
		return static::XPATH_EXPRESSION_FEED_ICON;
	}

	/**
	 * XPath expression for extracting the feed items from the source page
	 * @return string
	 */
	protected function getExpressionItem(){
		return static::XPATH_EXPRESSION_ITEM;
	}

	/**
	 * XPath expression for extracting an item title from the item context
	 * @return string
	 */
	protected function getExpressionItemTitle(){
		return static::XPATH_EXPRESSION_ITEM_TITLE;
	}

	/**
	 * XPath expression for extracting an item's content from the item context
	 * @return string
	 */
	protected function getExpressionItemContent(){
		return static::XPATH_EXPRESSION_ITEM_CONTENT;
	}

	/**
	 * XPath expression for extracting an item link from the item context
	 * @return string
	 */
	protected function getExpressionItemUri(){
		return static::XPATH_EXPRESSION_ITEM_URI;
	}

	/**
	 * XPath expression for extracting an item author from the item context
	 * @return string
	 */
	protected function getExpressionItemAuthor(){
		return static::XPATH_EXPRESSION_ITEM_AUTHOR;
	}

	/**
	 * XPath expression for extracting an item timestamp from the item context
	 * @return string
	 */
	protected function getExpressionItemTimestamp(){
		return static::XPATH_EXPRESSION_ITEM_TIMESTAMP;
	}

	/**
	 * XPath expression for extracting item enclosures (media content like
	 * images or movies) from the item context
	 * @return string
	 */
	protected function getExpressionItemEnclosures(){
		return static::XPATH_EXPRESSION_ITEM_ENCLOSURES;
	}

	/**
	 * XPath expression for extracting an item category from the item context
	 * @return string
	 */
	protected function getExpressionItemCategories(){
		return static::XPATH_EXPRESSION_ITEM_CATEGORIES;
	}

	/**
	 * Fix encoding
	 * @return bool
	 */
	protected function getSettingFixEncoding(){
		return static::SETTING_FIX_ENCODING;
	}

	/**
	 * Internal helper method for quickly accessing all the user defined constants
	 * in derived classes. Every lookup goes through the matching protected getter,
	 * so overriding a getter in a derived class also affects the value returned here.
	 *
	 * @param string $name Short parameter key (e.g. 'url', 'item', 'title')
	 * @return bool|string|null The parameter value, or null for an unknown key
	 */
	private function getParam($name){
		switch($name) {

			case 'url':
				return $this->getSourceUrl();
			case 'feed_title':
				return $this->getExpressionTitle();
			case 'feed_icon':
				return $this->getExpressionIcon();
			case 'item':
				return $this->getExpressionItem();
			case 'title':
				return $this->getExpressionItemTitle();
			case 'content':
				return $this->getExpressionItemContent();
			case 'uri':
				return $this->getExpressionItemUri();
			case 'author':
				return $this->getExpressionItemAuthor();
			case 'timestamp':
				return $this->getExpressionItemTimestamp();
			case 'enclosures':
				return $this->getExpressionItemEnclosures();
			case 'categories':
				return $this->getExpressionItemCategories();
			case 'fix_encoding':
				return $this->getSettingFixEncoding();
			default:
				return null;
		}
	}

	/**
	 * Should provide the source website HTML content
	 * can be easily overwritten for example if special headers or auth infos are required
	 * @return string
	 */
	protected function provideWebsiteContent() {
		return getContents($this->feedUri);
	}

	/**
	 * Should provide the feeds title
	 *
	 * A value is only returned when the title expression matches exactly one
	 * node. A blank or invalid expression yields null, in which case
	 * {@see XPathAbstract::getName()} falls back to the parent's name.
	 *
	 * @param DOMXPath $xpath
	 * @return string|null
	 */
	protected function provideFeedTitle(DOMXPath $xpath) {
		$expression = $this->getParam('feed_title');
		if('' === $expression) {
			return null;
		}

		// query() returns false (and raises a warning) for malformed expressions
		$title = @$xpath->query($expression);
		if($title instanceof DOMNodeList && $title->length === 1) {
			return $this->getItemValueOrNodeValue($title);
		}

		return null;
	}

	/**
	 * Should provide the URL of the feed's favicon
	 *
	 * A value is only returned when the icon expression matches exactly one
	 * node. A blank or invalid expression yields null, in which case
	 * {@see XPathAbstract::getIcon()} falls back to the parent's icon.
	 *
	 * @param DOMXPath $xpath
	 * @return string|null
	 */
	protected function provideFeedIcon(DOMXPath $xpath) {
		$expression = $this->getParam('feed_icon');
		if('' === $expression) {
			return null;
		}

		// query() returns false (and raises a warning) for malformed expressions
		$icon = @$xpath->query($expression);
		if($icon instanceof DOMNodeList && $icon->length === 1) {
			return $this->cleanImageUrl($this->getItemValueOrNodeValue($icon));
		}

		return null;
	}

	/**
	 * Should provide the feed's items.
	 *
	 * @param DOMXPath $xpath
	 * @return DOMNodeList|false False when the item expression cannot be evaluated
	 */
	protected function provideFeedItems(DOMXPath $xpath) {
		return @$xpath->query($this->getParam('item'));
	}

	/**
	 * Loads the source page, detects feed title and icon and builds one
	 * feed item per node matched by the item expression.
	 *
	 * For every item each non-blank field expression is evaluated with the
	 * item node as context; empty results are skipped, everything else is
	 * passed through {@see XPathAbstract::formatParamValue()} before being
	 * stored on the item.
	 */
	public function collectData() {

		$this->feedUri = $this->getParam('url');

		$webPageHtml = new DOMDocument();
		// Collect parser errors internally while loading, then restore
		// whatever error handling mode was active before this call.
		$previousErrorMode = libxml_use_internal_errors(true);
		$webPageHtml->loadHTML($this->provideWebsiteContent());
		libxml_clear_errors();
		libxml_use_internal_errors($previousErrorMode);

		$xpath = new DOMXPath($webPageHtml);

		$this->feedName = $this->provideFeedTitle($xpath);
		$this->feedIcon = $this->provideFeedIcon($xpath);

		$entries = $this->provideFeedItems($xpath);
		if($entries === false) {
			return;
		}

		foreach ($entries as $entry) {
			$item = new \FeedItem();
			foreach(array('title', 'content', 'uri', 'author', 'timestamp', 'enclosures', 'categories') as $param) {

				$expression = $this->getParam($param);
				if('' === $expression) {
					continue;
				}

				//can be a string or DOMNodeList, depending on the expression result
				$typedResult = @$xpath->evaluate($expression, $entry);
				if ($typedResult === false
					|| ($typedResult instanceof DOMNodeList && $typedResult->length === 0)
					|| (is_string($typedResult) && strlen(trim($typedResult)) === 0)) {
					continue;
				}

				$item->__set($param, $this->formatParamValue($param, $this->getItemValueOrNodeValue($typedResult)));

			}

			// A null id leaves the item's uid untouched
			$itemId = $this->generateItemId($item);
			if(null !== $itemId) {
				$item->setUid($itemId);
			}

			$this->items[] = $item;
		}

	}

	/**
	 * Applies the optional encoding fix to an extracted value and dispatches
	 * it to the overridable format method belonging to the given field.
	 *
	 * @param string $param Field name ('title', 'content', 'uri', 'author',
	 * 'timestamp', 'enclosures' or 'categories')
	 * @param string $value Raw extracted value
	 * @return string|array|false|int|null Formatted value; unknown fields are
	 * returned with only the encoding fix applied
	 */
	protected function formatParamValue($param, $value)
	{
		$value = $this->fixEncoding($value);
		switch ($param) {
			case 'title':
				return $this->formatItemTitle($value);
			case 'content':
				return $this->formatItemContent($value);
			case 'uri':
				return $this->formatItemUri($value);
			case 'author':
				return $this->formatItemAuthor($value);
			case 'timestamp':
				return $this->formatItemTimestamp($value);
			case 'enclosures':
				return $this->formatItemEnclosures($value);
			case 'categories':
				// The encoding fix has already been applied above and must
				// not be applied a second time.
				return $this->formatItemCategories($value);
		}
		return $value;
	}

	/**
	 * Formats the title of a feed item. Takes extracted raw title and returns it formatted
	 * as string.
	 * Can be easily overwritten for in case the value needs to be transformed into something
	 * else.
	 * @param string $value
	 * @return string
	 */
	protected function formatItemTitle($value) {
		return $value;
	}

	/**
	 * Formats the content of a feed item. Takes extracted raw content and returns it
	 * formatted as string.
	 * Can be easily overwritten for in case the value needs to be transformed into something
	 * else.
	 * @param string $value
	 * @return string
	 */
	protected function formatItemContent($value) {
		return $value;
	}

	/**
	 * Formats the URI of a feed item. Takes extracted raw URI and returns it formatted
	 * as string. Values starting with http:// or https:// are returned unchanged,
	 * anything else is resolved against the feed's source URI.
	 * Can be easily overwritten for in case the value needs to be transformed into something
	 * else.
	 * @param string $value
	 * @return string
	 */
	protected function formatItemUri($value) {
		if(strlen($value) === 0) {
			return '';
		}
		if(strpos($value, 'http://') === 0 || strpos($value, 'https://') === 0) {
			return $value;
		}

		return urljoin($this->feedUri, $value);
	}

	/**
	 * Formats the author of a feed item. Takes extracted raw author and returns it formatted
	 * as string.
	 * Can be easily overwritten for in case the value needs to be transformed into something
	 * else.
	 * @param string $value
	 * @return string
	 */
	protected function formatItemAuthor($value) {
		return $value;
	}

	/**
	 * Formats the timestamp of a feed item. Takes extracted raw timestamp and returns unix
	 * timestamp as integer.
	 * Can be easily overwritten for example if a special format has to be expected on the
	 * source website.
	 * @param string $value
	 * @return false|int False when the value cannot be parsed by strtotime
	 */
	protected function formatItemTimestamp($value) {
		return strtotime($value);
	}

	/**
	 * Formats the enclosures of a feed item. Takes extracted raw enclosures and returns them
	 * formatted as array.
	 * Can be easily overwritten for in case the values need to be transformed into something
	 * else.
	 * @param string $value
	 * @return array
	 */
	protected function formatItemEnclosures($value) {
		return array($this->cleanImageUrl($value));
	}

	/**
	 * Formats the categories of a feed item. Takes extracted raw categories and returns them
	 * formatted as array.
	 * Can be easily overwritten for in case the values need to be transformed into something
	 * else.
	 * @param string $value
	 * @return array
	 */
	protected function formatItemCategories($value) {
		return array($value);
	}

	/**
	 * Extracts the first image URL (jpg, gif, png, jpeg or ico) found inside
	 * the given string and resolves it against the feed's source URI.
	 *
	 * @param string $imageUrl Raw attribute value, e.g. a src or style attribute
	 * @return string|null Null when no image URL could be found
	 */
	protected function cleanImageUrl($imageUrl)
	{
		$result = preg_match('~(?:http(?:s)?:)?[\/a-zA-Z0-9\-_\.]+\.(?:jpg|gif|png|jpeg|ico){1}~', $imageUrl, $matches);
		if(1 !== $result) {
			return null;
		}
		return urljoin($this->feedUri, $matches[0]);
	}

	/**
	 * Reduces an XPath result to a trimmed string. For a node list only the
	 * first node is considered; elements, attributes and text nodes are
	 * supported. Non-empty string results are returned trimmed.
	 * Anything else is reported through returnServerError().
	 *
	 * @param DOMNodeList|string $typedResult
	 * @return string
	 */
	protected function getItemValueOrNodeValue($typedResult)
	{
		if($typedResult instanceof DOMNodeList) {
			$item = $typedResult->item(0);
			if ($item instanceof DOMElement) {
				return trim($item->nodeValue);
			} elseif ($item instanceof DOMAttr) {
				return trim($item->value);
			} elseif ($item instanceof DOMText) {
				// e.g. expressions ending in text()
				return trim($item->wholeText);
			}
		} elseif(is_string($typedResult) && strlen($typedResult) > 0) {
			return trim($typedResult);
		}
		returnServerError('Unknown type of XPath expression result.');
	}

	/**
	 * Fixes feed encoding by converting extracted texts from UTF-8 to
	 * ISO-8859-1 when the fix_encoding setting is enabled.
	 * Useful in case of "broken" or "weird" characters in the feed where you'd normally
	 * expect umlauts.
	 *
	 * @param string $input
	 * @return string
	 */
	protected function fixEncoding($input)
	{
		if(!$this->getParam('fix_encoding')) {
			return $input;
		}

		// utf8_decode() is deprecated as of PHP 8.2, so prefer the mbstring
		// equivalent and only fall back when that extension is not loaded.
		if(function_exists('mb_convert_encoding')) {
			return mb_convert_encoding($input, 'ISO-8859-1', 'UTF-8');
		}

		return utf8_decode($input);
	}

	/**
	 * Allows overriding default mechanism determining items Uid's
	 *
	 * @param FeedItem $item
	 * @return string|null Null means no explicit uid is set on the item
	 */
	protected function generateItemId(\FeedItem $item) {
		return null; //auto generation
	}
}
584