<?php

/**
 * Bridge whose feed-extraction rules are supplied entirely by the user.
 *
 * The class itself only declares the input form (PARAMETERS) and hands the
 * submitted values to the parent class through the getter methods below: the
 * source URL, one XPath expression per feed item field and the "fix encoding"
 * switch.
 *
 * NOTE(review): fetching the page and evaluating the expressions presumably
 * happens in XPathAbstract, which is not visible from this file - confirm there.
 */
class XPathBridge extends XPathAbstract
{
    const NAME = 'XPathBridge';
    const URI = 'https://github.com/rss-bridge/rss-bridge';
    const DESCRIPTION
        = 'Parse any webpage using <a href="https://devhints.io/xpath" target="_blank">XPath expressions</a>';
    const MAINTAINER = 'Niehztog';

    // Heredoc closing markers deliberately stay at column 0, followed by a
    // line starting with a comma: this keeps the help texts free of leading
    // indentation and remains valid on PHP versions older than 7.3.
    const PARAMETERS = array(
        '' => array(

            'url' => array(
                'name' => 'Enter web page URL',
                'title' => <<<"EOL"
You can specify any website URL which serves data suited for display in RSS feeds
(for example a news blog).
EOL
                , 'type' => 'text',
                'exampleValue' => 'https://news.blizzard.com/en-en',
                'defaultValue' => 'https://news.blizzard.com/en-en',
                'required' => true
            ),

            'item' => array(
                'name' => 'Item selector',
                'title' => <<<"EOL"
Enter an XPath expression matching a list of dom nodes, each node containing one
feed article item in total (usually a surrounding <div> or <span> tag). This will
be the context nodes for all of the following expressions. This expression usually
starts with a single forward slash.
EOL
                , 'type' => 'text',
                'exampleValue' => '/html/body/div/div[4]/div[2]/div[2]/div/div/section/ol/li/article',
                'defaultValue' => '/html/body/div/div[4]/div[2]/div[2]/div/div/section/ol/li/article',
                'required' => true
            ),

            'title' => array(
                'name' => 'Item title selector',
                'title' => <<<"EOL"
This expression should match a node contained within each article item node
containing the article headline. It should start with a dot followed by two
forward slashes, referring to any descendant nodes of the article item node.
EOL
                , 'type' => 'text',
                'exampleValue' => './/div/div[2]/h2',
                'defaultValue' => './/div/div[2]/h2',
                'required' => true
            ),

            'content' => array(
                'name' => 'Item description selector',
                'title' => <<<"EOL"
This expression should match a node contained within each article item node
containing the article content or description. It should start with a dot
followed by two forward slashes, referring to any descendant nodes of the
article item node.
EOL
                , 'type' => 'text',
                'exampleValue' => './/div[@class="ArticleListItem-description"]/div[@class="h6"]',
                'defaultValue' => './/div[@class="ArticleListItem-description"]/div[@class="h6"]',
                'required' => false
            ),

            'uri' => array(
                'name' => 'Item URL selector',
                'title' => <<<"EOL"
This expression should match a node's attribute containing the article URL
(usually the href attribute of an <a> tag). It should start with a dot
followed by two forward slashes, referring to any descendant nodes of
the article item node. Attributes can be selected by prepending an @ char
before the attributes name.
EOL
                , 'type' => 'text',
                'exampleValue' => './/a[@class="ArticleLink ArticleLink"]/@href',
                'defaultValue' => './/a[@class="ArticleLink ArticleLink"]/@href',
                'required' => false
            ),

            'author' => array(
                'name' => 'Item author selector',
                'title' => <<<"EOL"
This expression should match a node contained within each article item
node containing the article author's name. It should start with a dot
followed by two forward slashes, referring to any descendant nodes of
the article item node.
EOL
                , 'type' => 'text',
                'required' => false
            ),

            'timestamp' => array(
                'name' => 'Item date selector',
                'title' => <<<"EOL"
This expression should match a node or node's attribute containing the
article timestamp or date (parsable by PHP's strtotime function). It
should start with a dot followed by two forward slashes, referring to
any descendant nodes of the article item node. Attributes can be
selected by prepending an @ char before the attributes name.
EOL
                , 'type' => 'text',
                'exampleValue' => './/time[@class="ArticleListItem-footerTimestamp"]/@timestamp',
                'defaultValue' => './/time[@class="ArticleListItem-footerTimestamp"]/@timestamp',
                'required' => false
            ),

            'enclosures' => array(
                'name' => 'Item image selector',
                'title' => <<<"EOL"
This expression should match a node's attribute containing an article
image URL (usually the src attribute of an <img> tag or a style
attribute). It should start with a dot followed by two forward slashes,
referring to any descendant nodes of the article item node. Attributes
can be selected by prepending an @ char before the attributes name.
EOL
                , 'type' => 'text',
                'exampleValue' => './/div[@class="ArticleListItem-image"]/@style',
                'defaultValue' => './/div[@class="ArticleListItem-image"]/@style',
                'required' => false
            ),

            'categories' => array(
                'name' => 'Item category selector',
                'title' => <<<"EOL"
This expression should match a node or node's attribute contained
within each article item node containing the article category. This
could be inside <div> or <span> tags or sometimes be hidden
in a data attribute. It should start with a dot followed by two
forward slashes, referring to any descendant nodes of the article
item node. Attributes can be selected by prepending an @ char
before the attributes name.
EOL
                , 'type' => 'text',
                'exampleValue' => './/div[@class="ArticleListItem-label"]',
                'defaultValue' => './/div[@class="ArticleListItem-label"]',
                'required' => false
            ),

            'fix_encoding' => array(
                'name' => 'Fix encoding',
                'title' => <<<"EOL"
Check this to fix feed encoding by invoking PHP's utf8_decode
function on all extracted texts. Try this in case you see "broken" or
"weird" characters in your feed where you'd normally expect umlauts
or any other non-ascii characters.
EOL
                , 'type' => 'checkbox',
                'required' => false
            ),

        )
    );

    /**
     * Source Web page URL (should provide either HTML or XML content).
     *
     * The 'url' input is normalised by encodeUri() before it is returned.
     *
     * @return string
     */
    protected function getSourceUrl()
    {
        return $this->encodeUri($this->getInput('url'));
    }

    /**
     * XPath expression for extracting the feed items from the source page.
     *
     * @return string URL-decoded value of the 'item' input
     */
    protected function getExpressionItem()
    {
        return $this->getUrlDecodedInput('item');
    }

    /**
     * XPath expression for extracting an item title from the item context.
     *
     * @return string URL-decoded value of the 'title' input
     */
    protected function getExpressionItemTitle()
    {
        return $this->getUrlDecodedInput('title');
    }

    /**
     * XPath expression for extracting an item's content from the item context.
     *
     * @return string URL-decoded value of the 'content' input
     */
    protected function getExpressionItemContent()
    {
        return $this->getUrlDecodedInput('content');
    }

    /**
     * XPath expression for extracting an item link from the item context.
     *
     * @return string URL-decoded value of the 'uri' input
     */
    protected function getExpressionItemUri()
    {
        return $this->getUrlDecodedInput('uri');
    }

    /**
     * XPath expression for extracting an item author from the item context.
     *
     * @return string URL-decoded value of the 'author' input
     */
    protected function getExpressionItemAuthor()
    {
        return $this->getUrlDecodedInput('author');
    }

    /**
     * XPath expression for extracting an item timestamp from the item context.
     *
     * @return string URL-decoded value of the 'timestamp' input
     */
    protected function getExpressionItemTimestamp()
    {
        return $this->getUrlDecodedInput('timestamp');
    }

    /**
     * XPath expression for extracting item enclosures (media content like
     * images or movies) from the item context.
     *
     * @return string URL-decoded value of the 'enclosures' input
     */
    protected function getExpressionItemEnclosures()
    {
        return $this->getUrlDecodedInput('enclosures');
    }

    /**
     * XPath expression for extracting an item category from the item context.
     *
     * @return string URL-decoded value of the 'categories' input
     */
    protected function getExpressionItemCategories()
    {
        return $this->getUrlDecodedInput('categories');
    }

    /**
     * Fix encoding setting.
     *
     * Returns the 'fix_encoding' input exactly as getInput() reports it.
     *
     * @return mixed presumably a boolean-like checkbox state - TODO confirm
     *               against getInput() in the parent class
     */
    protected function getSettingFixEncoding()
    {
        return $this->getInput('fix_encoding');
    }

    /**
     * Reads one bridge input and returns it URL-decoded.
     *
     * The value is cast to string first so that an input without a value
     * decodes to an empty string instead of handing a non-string to
     * urldecode(), which PHP 8.1+ reports as deprecated for null.
     *
     * NOTE(review): urldecode() turns '+' into a space and resolves %XX
     * sequences. If getInput() already delivers decoded values, expressions
     * containing those characters get altered here - confirm this second
     * decode is intended.
     *
     * @param string $name Key of the input as declared in PARAMETERS
     * @return string
     */
    private function getUrlDecodedInput($name)
    {
        return urldecode((string)$this->getInput($name));
    }

    /**
     * Fixes URL encoding issues in input URLs.
     *
     * A URL whose scheme separator arrives percent-encoded (it starts with
     * "https%3A%2F%2F" or "http%3A%2F%2F") is URL-decoded as a whole.
     * Afterwards every pipe character is replaced by its percent-encoded
     * form "%7C".
     *
     * @param string|null $uri Raw URL; null is treated like an empty string
     * @return string
     */
    private function encodeUri($uri)
    {
        // Normalise once so none of the string functions below receives null.
        $uri = (string)$uri;

        if (strpos($uri, 'https%3A%2F%2F') === 0
            || strpos($uri, 'http%3A%2F%2F') === 0) {
            $uri = urldecode($uri);
        }

        return str_replace('|', '%7C', $uri);
    }
}