<?php

/**
 * An alternative abstract class for bridges utilizing XPath expressions
 *
 * This class is meant as an alternative base class for bridge implementations.
 * It offers preliminary functionality for generating feeds based on XPath
 * expressions.
 * As a minimum, extending classes should define XPath expressions pointing
 * to the feed items contents in the class constants below. In case there is
 * more manual fine tuning required, it offers a bunch of methods which can
 * be overridden, for example in order to specify formatting of field values
 * or more flexible definition of dynamic XPath expressions.
 *
 * This class extends {@see BridgeAbstract}, which means it incorporates and
 * extends all of its functionality.
 **/
abstract class XPathAbstract extends BridgeAbstract
{
    /**
     * Source Web page URL (should provide either HTML or XML content)
     * You can specify any website URL which serves data suited for display in RSS feeds
     * (for example a news blog).
     *
     * Use {@see XPathAbstract::getSourceUrl()} to read this parameter
     */
    const FEED_SOURCE_URL = '';

    /**
     * XPath expression for extracting the feed title from the source page.
     * If this is left blank or does not provide any data {@see BridgeAbstract::getName()}
     * is used instead as the feed's title.
     *
     * Use {@see XPathAbstract::getExpressionTitle()} to read this parameter
     */
    const XPATH_EXPRESSION_FEED_TITLE = './/title';

    /**
     * XPath expression for extracting the feed favicon URL from the source page.
     * If this is left blank or does not provide any data {@see BridgeAbstract::getIcon()}
     * is used instead as the feed's favicon URL.
     *
     * Use {@see XPathAbstract::getExpressionIcon()} to read this parameter
     */
    const XPATH_EXPRESSION_FEED_ICON = './/link[@rel="icon"]/@href';

    /**
     * XPath expression for extracting the feed items from the source page
     * Enter an XPath expression matching a list of dom nodes, each node containing one
     * feed article item in total (usually a surrounding <div> or <span> tag). This will
     * be the context nodes for all of the following expressions. This expression usually
     * starts with a single forward slash.
     *
     * Use {@see XPathAbstract::getExpressionItem()} to read this parameter
     */
    const XPATH_EXPRESSION_ITEM = '';

    /**
     * XPath expression for extracting an item title from the item context
     * This expression should match a node contained within each article item node
     * containing the article headline. It should start with a dot followed by two
     * forward slashes, referring to any descendant nodes of the article item node.
     *
     * Use {@see XPathAbstract::getExpressionItemTitle()} to read this parameter
     */
    const XPATH_EXPRESSION_ITEM_TITLE = '';

    /**
     * XPath expression for extracting an item's content from the item context
     * This expression should match a node contained within each article item node
     * containing the article content or description. It should start with a dot
     * followed by two forward slashes, referring to any descendant nodes of the
     * article item node.
     *
     * Use {@see XPathAbstract::getExpressionItemContent()} to read this parameter
     */
    const XPATH_EXPRESSION_ITEM_CONTENT = '';

    /**
     * XPath expression for extracting an item link from the item context
     * This expression should match a node's attribute containing the article URL
     * (usually the href attribute of an <a> tag). It should start with a dot
     * followed by two forward slashes, referring to any descendant nodes of
     * the article item node. Attributes can be selected by prepending an @ char
     * before the attributes name.
     *
     * Use {@see XPathAbstract::getExpressionItemUri()} to read this parameter
     */
    const XPATH_EXPRESSION_ITEM_URI = '';

    /**
     * XPath expression for extracting an item author from the item context
     * This expression should match a node contained within each article item
     * node containing the article author's name. It should start with a dot
     * followed by two forward slashes, referring to any descendant nodes of
     * the article item node.
     *
     * Use {@see XPathAbstract::getExpressionItemAuthor()} to read this parameter
     */
    const XPATH_EXPRESSION_ITEM_AUTHOR = '';

    /**
     * XPath expression for extracting an item timestamp from the item context
     * This expression should match a node or node's attribute containing the
     * article timestamp or date (parsable by PHP's strtotime function). It
     * should start with a dot followed by two forward slashes, referring to
     * any descendant nodes of the article item node. Attributes can be
     * selected by prepending an @ char before the attributes name.
     *
     * Use {@see XPathAbstract::getExpressionItemTimestamp()} to read this parameter
     */
    const XPATH_EXPRESSION_ITEM_TIMESTAMP = '';

    /**
     * XPath expression for extracting item enclosures (media content like
     * images or movies) from the item context
     * This expression should match a node's attribute containing an article
     * image URL (usually the src attribute of an <img> tag or a style
     * attribute). It should start with a dot followed by two forward slashes,
     * referring to any descendant nodes of the article item node. Attributes
     * can be selected by prepending an @ char before the attributes name.
     *
     * Use {@see XPathAbstract::getExpressionItemEnclosures()} to read this parameter
     */
    const XPATH_EXPRESSION_ITEM_ENCLOSURES = '';

    /**
     * XPath expression for extracting an item category from the item context
     * This expression should match a node or node's attribute contained
     * within each article item node containing the article category. This
     * could be inside <div> or <span> tags or sometimes be hidden
     * in a data attribute. It should start with a dot followed by two
     * forward slashes, referring to any descendant nodes of the article
     * item node. Attributes can be selected by prepending an @ char
     * before the attributes name.
     *
     * Use {@see XPathAbstract::getExpressionItemCategories()} to read this parameter
     */
    const XPATH_EXPRESSION_ITEM_CATEGORIES = '';

    /**
     * Fix encoding
     * Set this to true for fixing feed encoding by invoking PHP's utf8_decode
     * function on all extracted texts. Try this in case you see "broken" or
     * "weird" characters in your feed where you'd normally expect umlauts
     * or any other non-ascii characters.
     *
     * Use {@see XPathAbstract::getSettingFixEncoding()} to read this parameter
     */
    const SETTING_FIX_ENCODING = false;

    /**
     * Internal storage for resulting feed name, automatically detected
     * @var string
     */
    private $feedName;

    /**
     * Internal storage for the feed URI; set to the source URL at the start
     * of {@see XPathAbstract::collectData()}
     * @var string
     */
    private $feedUri;

    /**
     * Internal storage for resulting feed favicon, automatically detected
     * @var string
     */
    private $feedIcon;

    /**
     * Returns the detected feed title, falling back to the parent's name when
     * nothing (or an empty value) was detected.
     * @return string
     */
    public function getName()
    {
        return $this->feedName ?: parent::getName();
    }

    /**
     * Returns the feed URI stored by collectData(), falling back to the
     * parent's URI while it is still unset or empty.
     * @return string
     */
    public function getURI()
    {
        return $this->feedUri ?: parent::getURI();
    }

    /**
     * Returns the detected favicon URL, falling back to the parent's icon when
     * nothing (or an empty value) was detected.
     * @return string
     */
    public function getIcon()
    {
        return $this->feedIcon ?: parent::getIcon();
    }

    /**
     * Source Web page URL (should provide either HTML or XML content)
     * @return string
     */
    protected function getSourceUrl()
    {
        return static::FEED_SOURCE_URL;
    }

    /**
     * XPath expression for extracting the feed title from the source page
     * @return string
     */
    protected function getExpressionTitle()
    {
        return static::XPATH_EXPRESSION_FEED_TITLE;
    }

    /**
     * XPath expression for extracting the feed favicon from the source page
     * @return string
     */
    protected function getExpressionIcon()
    {
        return static::XPATH_EXPRESSION_FEED_ICON;
    }

    /**
     * XPath expression for extracting the feed items from the source page
     * @return string
     */
    protected function getExpressionItem()
    {
        return static::XPATH_EXPRESSION_ITEM;
    }

    /**
     * XPath expression for extracting an item title from the item context
     * @return string
     */
    protected function getExpressionItemTitle()
    {
        return static::XPATH_EXPRESSION_ITEM_TITLE;
    }

    /**
     * XPath expression for extracting an item's content from the item context
     * @return string
     */
    protected function getExpressionItemContent()
    {
        return static::XPATH_EXPRESSION_ITEM_CONTENT;
    }

    /**
     * XPath expression for extracting an item link from the item context
     * @return string
     */
    protected function getExpressionItemUri()
    {
        return static::XPATH_EXPRESSION_ITEM_URI;
    }

    /**
     * XPath expression for extracting an item author from the item context
     * @return string
     */
    protected function getExpressionItemAuthor()
    {
        return static::XPATH_EXPRESSION_ITEM_AUTHOR;
    }

    /**
     * XPath expression for extracting an item timestamp from the item context
     * @return string
     */
    protected function getExpressionItemTimestamp()
    {
        return static::XPATH_EXPRESSION_ITEM_TIMESTAMP;
    }

    /**
     * XPath expression for extracting item enclosures (media content like
     * images or movies) from the item context
     * @return string
     */
    protected function getExpressionItemEnclosures()
    {
        return static::XPATH_EXPRESSION_ITEM_ENCLOSURES;
    }

    /**
     * XPath expression for extracting an item category from the item context
     * @return string
     */
    protected function getExpressionItemCategories()
    {
        return static::XPATH_EXPRESSION_ITEM_CATEGORIES;
    }

    /**
     * Fix encoding
     * @return bool
     */
    protected function getSettingFixEncoding()
    {
        return static::SETTING_FIX_ENCODING;
    }

    /**
     * Internal helper method for quickly accessing all the user defined constants
     * in derived classes. Every lookup goes through the matching protected getter
     * so that overrides in derived classes are honoured.
     *
     * @param string $name One of: url, feed_title, feed_icon, item, title, content,
     *                     uri, author, timestamp, enclosures, categories, fix_encoding
     * @return bool|string|null Null for an unknown parameter name
     */
    private function getParam($name)
    {
        switch ($name) {
            case 'url':
                return $this->getSourceUrl();
            case 'feed_title':
                return $this->getExpressionTitle();
            case 'feed_icon':
                return $this->getExpressionIcon();
            case 'item':
                return $this->getExpressionItem();
            case 'title':
                return $this->getExpressionItemTitle();
            case 'content':
                return $this->getExpressionItemContent();
            case 'uri':
                return $this->getExpressionItemUri();
            case 'author':
                return $this->getExpressionItemAuthor();
            case 'timestamp':
                return $this->getExpressionItemTimestamp();
            case 'enclosures':
                return $this->getExpressionItemEnclosures();
            case 'categories':
                return $this->getExpressionItemCategories();
            case 'fix_encoding':
                return $this->getSettingFixEncoding();
        }

        return null;
    }

    /**
     * Should provide the source website HTML content
     * can be easily overwritten for example if special headers or auth infos are required
     * @return string
     */
    protected function provideWebsiteContent()
    {
        return getContents($this->feedUri);
    }

    /**
     * Runs a feed-level XPath expression and returns its result only when it
     * matches exactly one node.
     *
     * A blank expression is skipped without querying, and a failed query
     * (DOMXPath::query() returning false) is treated like "no match", so the
     * callers never count() a non-countable value.
     *
     * @param DOMXPath $xpath
     * @param mixed $expression
     * @return DOMNodeList|null
     */
    private function querySingleNode(DOMXPath $xpath, $expression)
    {
        if (!is_string($expression) || '' === trim($expression)) {
            return null;
        }

        $nodes = $xpath->query($expression);
        if ($nodes instanceof DOMNodeList && 1 === $nodes->length) {
            return $nodes;
        }

        return null;
    }

    /**
     * Should provide the feeds title
     *
     * @param DOMXPath $xpath
     * @return string|null Null when the title expression is blank or does not
     *                     match exactly one node
     */
    protected function provideFeedTitle(DOMXPath $xpath)
    {
        $title = $this->querySingleNode($xpath, $this->getParam('feed_title'));
        if (null === $title) {
            return null;
        }

        return $this->getItemValueOrNodeValue($title);
    }

    /**
     * Should provide the URL of the feed's favicon
     *
     * @param DOMXPath $xpath
     * @return string|null Null when the icon expression is blank, does not match
     *                     exactly one node, or the value contains no image URL
     */
    protected function provideFeedIcon(DOMXPath $xpath)
    {
        $icon = $this->querySingleNode($xpath, $this->getParam('feed_icon'));
        if (null === $icon) {
            return null;
        }

        return $this->cleanImageUrl($this->getItemValueOrNodeValue($icon));
    }

    /**
     * Should provide the feed's items.
     *
     * @param DOMXPath $xpath
     * @return DOMNodeList|false False when the item expression cannot be evaluated
     */
    protected function provideFeedItems(DOMXPath $xpath)
    {
        return @$xpath->query($this->getParam('item'));
    }

    /**
     * Loads the source page, detects feed title and icon and turns every node
     * matched by the item expression into a feed item appended to $this->items.
     */
    public function collectData()
    {
        $this->feedUri = $this->getParam('url');

        $webPageHtml = new DOMDocument();
        // Silence markup warnings while parsing, then restore whatever error
        // handling mode was active before instead of forcing it to "off".
        $previousErrorMode = libxml_use_internal_errors(true);
        $webPageHtml->loadHTML($this->provideWebsiteContent());
        libxml_clear_errors();
        libxml_use_internal_errors($previousErrorMode);

        $xpath = new DOMXPath($webPageHtml);

        $this->feedName = $this->provideFeedTitle($xpath);
        $this->feedIcon = $this->provideFeedIcon($xpath);

        $entries = $this->provideFeedItems($xpath);
        if ($entries === false) {
            return;
        }

        $itemParams = array('title', 'content', 'uri', 'author', 'timestamp', 'enclosures', 'categories');

        foreach ($entries as $entry) {
            $item = new \FeedItem();

            foreach ($itemParams as $param) {
                $expression = $this->getParam($param);
                if ('' === $expression) {
                    continue;
                }

                //can be a string or DOMNodeList, depending on the expression result
                $typedResult = @$xpath->evaluate($expression, $entry);

                $isFailure = $typedResult === false;
                $isEmptyList = $typedResult instanceof DOMNodeList && $typedResult->length === 0;
                $isBlankString = is_string($typedResult) && strlen(trim($typedResult)) === 0;
                if ($isFailure || $isEmptyList || $isBlankString) {
                    continue;
                }

                $rawValue = $this->getItemValueOrNodeValue($typedResult);
                $item->__set($param, $this->formatParamValue($param, $rawValue));
            }

            $itemId = $this->generateItemId($item);
            if (null !== $itemId) {
                $item->setUid($itemId);
            }

            $this->items[] = $item;
        }
    }

    /**
     * Applies the optional encoding fix once and then dispatches the value to
     * the overridable formatter belonging to the given item parameter.
     *
     * @param string $param Item parameter name (title, content, uri, ...)
     * @param string $value Raw extracted value
     * @return string|array|int|false Formatter result; the value itself for
     *                                unknown parameter names
     */
    protected function formatParamValue($param, $value)
    {
        $value = $this->fixEncoding($value);
        switch ($param) {
            case 'title':
                return $this->formatItemTitle($value);
            case 'content':
                return $this->formatItemContent($value);
            case 'uri':
                return $this->formatItemUri($value);
            case 'author':
                return $this->formatItemAuthor($value);
            case 'timestamp':
                return $this->formatItemTimestamp($value);
            case 'enclosures':
                // Delegate to the hook so overrides in derived classes take effect.
                return $this->formatItemEnclosures($value);
            case 'categories':
                // Delegate to the hook; the encoding fix was already applied above
                // and must not be applied a second time.
                return $this->formatItemCategories($value);
        }
        return $value;
    }

    /**
     * Formats the title of a feed item. Takes extracted raw title and returns it formatted
     * as string.
     * Can be easily overwritten for in case the value needs to be transformed into something
     * else.
     * @param string $value
     * @return string
     */
    protected function formatItemTitle($value)
    {
        return $value;
    }

    /**
     * Formats the content of a feed item. Takes extracted raw content and returns it
     * formatted as string.
     * Can be easily overwritten for in case the value needs to be transformed into something
     * else.
     * @param string $value
     * @return string
     */
    protected function formatItemContent($value)
    {
        return $value;
    }

    /**
     * Formats the URI of a feed item. Takes extracted raw URI and returns it formatted
     * as string. Values already starting with http:// or https:// are returned as is,
     * anything else is joined with the feed URI through urljoin().
     * Can be easily overwritten for in case the value needs to be transformed into something
     * else.
     * @param string $value
     * @return string
     */
    protected function formatItemUri($value)
    {
        if (strlen($value) === 0) {
            return '';
        }
        if (strpos($value, 'http://') === 0 || strpos($value, 'https://') === 0) {
            return $value;
        }

        return urljoin($this->feedUri, $value);
    }

    /**
     * Formats the author of a feed item. Takes extracted raw author and returns it formatted
     * as string.
     * Can be easily overwritten for in case the value needs to be transformed into something
     * else.
     * @param string $value
     * @return string
     */
    protected function formatItemAuthor($value)
    {
        return $value;
    }

    /**
     * Formats the timestamp of a feed item. Takes extracted raw timestamp and returns unix
     * timestamp as integer.
     * Can be easily overwritten for example if a special format has to be expected on the
     * source website.
     * @param string $value
     * @return false|int
     */
    protected function formatItemTimestamp($value)
    {
        return strtotime($value);
    }

    /**
     * Formats the enclosures of a feed item. Takes extracted raw enclosures and returns them
     * formatted as array.
     * Can be easily overwritten for in case the values need to be transformed into something
     * else.
     * @param string $value
     * @return array
     */
    protected function formatItemEnclosures($value)
    {
        return array($this->cleanImageUrl($value));
    }

    /**
     * Formats the categories of a feed item. Takes extracted raw categories and returns them
     * formatted as array.
     * Can be easily overwritten for in case the values need to be transformed into something
     * else.
     * @param string $value
     * @return array
     */
    protected function formatItemCategories($value)
    {
        return array($value);
    }

    /**
     * Extracts the first image URL (jpg, gif, png, jpeg or ico) found inside the
     * given string and joins it with the feed URI through urljoin().
     *
     * @param string $imageUrl Raw attribute or node value containing an image URL
     * @return string|null Null when the value contains no image URL
     */
    protected function cleanImageUrl($imageUrl)
    {
        $result = preg_match('~(?:http(?:s)?:)?[\/a-zA-Z0-9\-_\.]+\.(?:jpg|gif|png|jpeg|ico){1}~', $imageUrl, $matches);
        if (1 !== $result) {
            return null;
        }
        return urljoin($this->feedUri, $matches[0]);
    }

    /**
     * Reduces an XPath result to a trimmed string.
     *
     * For a node list only the first node is used: elements and text nodes
     * yield their node value, attributes yield their attribute value. A
     * non-empty string result is returned trimmed. Everything else is passed
     * to returnServerError(), which is defined outside this file.
     * NOTE(review): presumably returnServerError() aborts by throwing - confirm.
     *
     * @param DOMNodeList|string|mixed $typedResult
     * @return string
     */
    protected function getItemValueOrNodeValue($typedResult)
    {
        if ($typedResult instanceof DOMNodeList) {
            $item = $typedResult->item(0);
            if ($item instanceof DOMElement) {
                return trim($item->nodeValue);
            } elseif ($item instanceof DOMAttr) {
                return trim($item->value);
            } elseif ($item instanceof DOMText) {
                // Expressions ending in text() select text nodes directly.
                return trim($item->nodeValue);
            }
        } elseif (is_string($typedResult) && strlen($typedResult) > 0) {
            return trim($typedResult);
        }
        returnServerError('Unknown type of XPath expression result.');
    }

    /**
     * Fixes feed encoding by invoking PHP's utf8_decode function on extracted texts.
     * Useful in case of "broken" or "weird" characters in the feed where you'd normally
     * expect umlauts. Returns the input untouched when the setting is disabled.
     * NOTE(review): utf8_decode() is deprecated as of PHP 8.2 - plan a replacement.
     *
     * @param string $input
     * @return string
     */
    protected function fixEncoding($input)
    {
        return $this->getParam('fix_encoding') ? utf8_decode($input) : $input;
    }

    /**
     * Allows overriding default mechanism determining items Uid's
     *
     * @param FeedItem $item
     * @return string|null Null keeps the automatically generated id
     */
    protected function generateItemId(\FeedItem $item)
    {
        return null; //auto generation
    }
}