1<?php
2
/**
 * Generic bridge driven entirely by user input: the source page URL and one
 * XPath expression per feed item field are read from the bridge parameters
 * and exposed through the getters below.
 *
 * NOTE(review): the getters are presumably consumed by XPathAbstract to build
 * the feed; that class is not visible from this file - confirm there.
 */
class XPathBridge extends XPathAbstract {
	const NAME = 'XPathBridge';
	const URI = 'https://github.com/rss-bridge/rss-bridge';
	const DESCRIPTION
		= 'Parse any webpage using <a href="https://devhints.io/xpath" target="_blank">XPath expressions</a>';
	const MAINTAINER = 'Niehztog';

	// Closing heredoc markers below are kept at column 0 on their own line,
	// which is required by PHP versions before 7.3.
	const PARAMETERS = array(
		'' => array(

			'url' => array(
				'name' => 'Enter web page URL',
				'title' => <<<"EOL"
You can specify any website URL which serves data suited for display in RSS feeds
(for example a news blog).
EOL
				, 'type' => 'text',
				'exampleValue' => 'https://news.blizzard.com/en-en',
				'defaultValue' => 'https://news.blizzard.com/en-en',
				'required' => true
			),

			'item' => array(
				'name' => 'Item selector',
				'title' => <<<"EOL"
Enter an XPath expression matching a list of dom nodes, each node containing one
feed article item in total (usually a surrounding &lt;div&gt; or &lt;span&gt; tag). This will
be the context nodes for all of the following expressions. This expression usually
starts with a single forward slash.
EOL
				, 'type' => 'text',
				'exampleValue' => '/html/body/div/div[4]/div[2]/div[2]/div/div/section/ol/li/article',
				'defaultValue' => '/html/body/div/div[4]/div[2]/div[2]/div/div/section/ol/li/article',
				'required' => true
			),

			'title' => array(
				'name' => 'Item title selector',
				'title' => <<<"EOL"
This expression should match a node contained within each article item node
containing the article headline. It should start with a dot followed by two
forward slashes, referring to any descendant nodes of the article item node.
EOL
				, 'type' => 'text',
				'exampleValue' => './/div/div[2]/h2',
				'defaultValue' => './/div/div[2]/h2',
				'required' => true
			),

			'content' => array(
				'name' => 'Item description selector',
				'title' => <<<"EOL"
This expression should match a node contained within each article item node
containing the article content or description. It should start with a dot
followed by two forward slashes, referring to any descendant nodes of the
article item node.
EOL
				, 'type' => 'text',
				'exampleValue' => './/div[@class="ArticleListItem-description"]/div[@class="h6"]',
				'defaultValue' => './/div[@class="ArticleListItem-description"]/div[@class="h6"]',
				'required' => false
			),

			'uri' => array(
				'name' => 'Item URL selector',
				'title' => <<<"EOL"
This expression should match a node's attribute containing the article URL
(usually the href attribute of an &lt;a&gt; tag). It should start with a dot
followed by two forward slashes, referring to any descendant nodes of
the article item node. Attributes can be selected by prepending an @ char
before the attributes name.
EOL
				, 'type' => 'text',
				'exampleValue' => './/a[@class="ArticleLink ArticleLink"]/@href',
				'defaultValue' => './/a[@class="ArticleLink ArticleLink"]/@href',
				'required' => false
			),

			'author' => array(
				'name' => 'Item author selector',
				'title' => <<<"EOL"
This expression should match a node contained within each article item
node containing the article author's name. It should start with a dot
followed by two forward slashes, referring to any descendant nodes of
the article item node.
EOL
				, 'type' => 'text',
				'required' => false
			),

			'timestamp' => array(
				'name' => 'Item date selector',
				'title' => <<<"EOL"
This expression should match a node or node's attribute containing the
article timestamp or date (parsable by PHP's strtotime function). It
should start with a dot followed by two forward slashes, referring to
any descendant nodes of the article item node. Attributes can be
selected by prepending an @ char before the attributes name.
EOL
				, 'type' => 'text',
				'exampleValue' => './/time[@class="ArticleListItem-footerTimestamp"]/@timestamp',
				'defaultValue' => './/time[@class="ArticleListItem-footerTimestamp"]/@timestamp',
				'required' => false
			),

			'enclosures' => array(
				'name' => 'Item image selector',
				'title' => <<<"EOL"
This expression should match a node's attribute containing an article
image URL (usually the src attribute of an &lt;img&gt; tag or a style
attribute). It should start with a dot followed by two forward slashes,
referring to any descendant nodes of the article item node. Attributes
can be selected by prepending an @ char before the attributes name.
EOL
				, 'type' => 'text',
				'exampleValue' => './/div[@class="ArticleListItem-image"]/@style',
				'defaultValue' => './/div[@class="ArticleListItem-image"]/@style',
				'required' => false
			),

			'categories' => array(
				'name' => 'Item category selector',
				'title' => <<<"EOL"
This expression should match a node or node's attribute contained
within each article item node containing the article category. This
could be inside &lt;div&gt; or &lt;span&gt; tags or sometimes be hidden
in a data attribute. It should start with a dot followed by two
forward slashes, referring to any descendant nodes of the article
item node. Attributes can be selected by prepending an @ char
before the attributes name.
EOL
				, 'type' => 'text',
				'exampleValue' => './/div[@class="ArticleListItem-label"]',
				'defaultValue' => './/div[@class="ArticleListItem-label"]',
				'required' => false
			),

			'fix_encoding' => array(
				'name' => 'Fix encoding',
				'title' => <<<"EOL"
Check this to fix feed encoding by invoking PHP's utf8_decode
function on all extracted texts. Try this in case you see "broken" or
"weird" characters in your feed where you'd normally expect umlauts
or any other non-ascii characters.
EOL
				, 'type' => 'checkbox',
				'required' => false
			),

		)
	);

	/**
	 * Source Web page URL (should provide either HTML or XML content).
	 *
	 * The 'url' input is passed through encodeUri() before being returned.
	 *
	 * @return string
	 */
	protected function getSourceUrl(){
		return $this->encodeUri($this->getInput('url'));
	}

	/**
	 * XPath expression for extracting the feed items from the source page.
	 *
	 * The 'item' input is URL-decoded. It is cast to string first so that an
	 * unset (null) input never reaches urldecode(), which is deprecated since
	 * PHP 8.1; the result is an empty string in that case.
	 *
	 * @return string
	 */
	protected function getExpressionItem(){
		return urldecode((string)$this->getInput('item'));
	}

	/**
	 * XPath expression for extracting an item title from the item context.
	 *
	 * URL-decoded 'title' input; an unset input yields an empty string.
	 *
	 * @return string
	 */
	protected function getExpressionItemTitle(){
		return urldecode((string)$this->getInput('title'));
	}

	/**
	 * XPath expression for extracting an item's content from the item context.
	 *
	 * URL-decoded 'content' input; an unset input yields an empty string.
	 *
	 * @return string
	 */
	protected function getExpressionItemContent(){
		return urldecode((string)$this->getInput('content'));
	}

	/**
	 * XPath expression for extracting an item link from the item context.
	 *
	 * URL-decoded 'uri' input; an unset input yields an empty string.
	 *
	 * @return string
	 */
	protected function getExpressionItemUri(){
		return urldecode((string)$this->getInput('uri'));
	}

	/**
	 * XPath expression for extracting an item author from the item context.
	 *
	 * URL-decoded 'author' input; an unset input yields an empty string.
	 *
	 * @return string
	 */
	protected function getExpressionItemAuthor(){
		return urldecode((string)$this->getInput('author'));
	}

	/**
	 * XPath expression for extracting an item timestamp from the item context.
	 *
	 * URL-decoded 'timestamp' input; an unset input yields an empty string.
	 *
	 * @return string
	 */
	protected function getExpressionItemTimestamp(){
		return urldecode((string)$this->getInput('timestamp'));
	}

	/**
	 * XPath expression for extracting item enclosures (media content like
	 * images or movies) from the item context.
	 *
	 * URL-decoded 'enclosures' input; an unset input yields an empty string.
	 *
	 * @return string
	 */
	protected function getExpressionItemEnclosures(){
		return urldecode((string)$this->getInput('enclosures'));
	}

	/**
	 * XPath expression for extracting an item category from the item context.
	 *
	 * URL-decoded 'categories' input; an unset input yields an empty string.
	 *
	 * @return string
	 */
	protected function getExpressionItemCategories(){
		return urldecode((string)$this->getInput('categories'));
	}

	/**
	 * Fix encoding setting.
	 *
	 * Returns the raw value of the 'fix_encoding' checkbox input exactly as
	 * getInput() provides it.
	 *
	 * @return mixed presumably a boolean-like checkbox state - TODO confirm
	 *               against getInput()
	 */
	protected function getSettingFixEncoding(){
		return $this->getInput('fix_encoding');
	}

	/**
	 * Fixes URL encoding issues in input URLs.
	 *
	 * If the whole URL arrives percent-encoded (it starts with an encoded
	 * "http://" or "https://" prefix) it is decoded once. Afterwards every
	 * pipe character is replaced by its percent-encoded form "%7C".
	 *
	 * @param string|null $uri URL as entered by the user; null is treated as
	 *                         an empty string
	 * @return string
	 */
	private function encodeUri($uri)
	{
		// Normalise to string so the string functions below never receive
		// null (deprecated since PHP 8.1).
		$uri = (string)$uri;

		// Percent-encoding hex digits are case-insensitive ("%3A" equals
		// "%3a"), so the prefix check must be case-insensitive as well.
		if (stripos($uri, 'https%3A%2F%2F') === 0
			|| stripos($uri, 'http%3A%2F%2F') === 0) {
			$uri = urldecode($uri);
		}

		return str_replace('|', '%7C', $uri);
	}
}
252