1<?php
2
3/**
4 * Copyright (c) 2008, David R. Nadeau, NadeauSoftware.com.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 *
11 *	* Redistributions of source code must retain the above copyright
12 *	  notice, this list of conditions and the following disclaimer.
13 *
14 *	* Redistributions in binary form must reproduce the above
15 *	  copyright notice, this list of conditions and the following
16 *	  disclaimer in the documentation and/or other materials provided
17 *	  with the distribution.
18 *
19 *	* Neither the names of David R. Nadeau or NadeauSoftware.com, nor
20 *	  the names of its contributors may be used to endorse or promote
21 *	  products derived from this software without specific prior
22 *	  written permission.
23 *
24 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
25 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
26 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
27 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
28 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
29 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
30 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
31 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
32 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
34 * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY
35 * OF SUCH DAMAGE.
36 */
37
38/*
39 * This is a BSD License approved by the Open Source Initiative (OSI).
40 * See:  http://www.opensource.org/licenses/bsd-license.php
41 */
42
// Stop execution unless the MOODLE_INTERNAL constant is defined
// (presumably set by the including application before this file is
// loaded -- confirm against the caller).
defined('MOODLE_INTERNAL') || die();
44
/**
 * Combine a base URL and a relative URL to produce a new
 * absolute URL.  The base URL is often the URL of a page,
 * and the relative URL is a URL embedded on that page.
 *
 * This function implements the "absolutize" algorithm from
 * the RFC3986 specification for URLs.
 *
 * This function supports multi-byte characters with the UTF-8 encoding,
 * per the URL specification.
 *
 * Parameters:
 * 	baseUrl		the absolute base URL.  It must carry both a
 * 			scheme and a host.
 *
 * 	relativeUrl	the relative URL to convert.  If it already has
 * 			a scheme it is returned with its path cleaned.
 *
 * Return values:
 * 	An absolute URL that combines parts of the base and relative
 * 	URLs, or FALSE if the base URL is not absolute or if either
 * 	URL cannot be parsed.
 */
function url_to_absolute( $baseUrl, $relativeUrl )
{
	$r = split_url( $relativeUrl );
	if ( $r === FALSE )
		return FALSE;
	// An empty reference may parse to a non-array "no parts" value;
	// normalise it rather than relying on implicit array creation.
	if ( !is_array( $r ) )
		$r = array( );

	// Explicit comparison instead of empty(): "0" is a valid path.
	$rHasPath = isset( $r['path'] ) && $r['path'] !== '';

	// If relative URL has a scheme, clean path and return.
	if ( isset( $r['scheme'] ) && $r['scheme'] !== '' )
	{
		if ( $rHasPath && $r['path'][0] === '/' )
			$r['path'] = url_remove_dot_segments( $r['path'] );
		return join_url( $r );
	}

	// Make sure the base URL is absolute (scheme and host present).
	$b = split_url( $baseUrl );
	if ( !is_array( $b ) ||
		!isset( $b['scheme'] ) || $b['scheme'] === '' ||
		!isset( $b['host'] )   || $b['host'] === '' )
		return FALSE;
	$r['scheme'] = $b['scheme'];
	$basePath    = isset( $b['path'] ) ? $b['path'] : '';

	// If relative URL has an authority, clean path and return.
	if ( isset( $r['host'] ) )
	{
		if ( $rHasPath )
			$r['path'] = url_remove_dot_segments( $r['path'] );
		return join_url( $r );
	}

	// No authority of its own: inherit the base authority wholesale.
	unset( $r['port'], $r['user'], $r['pass'] );
	$r['host'] = $b['host'];
	foreach ( array( 'port', 'user', 'pass' ) as $key )
	{
		if ( isset( $b[$key] ) )
			$r[$key] = $b[$key];
	}

	// If relative URL has no path, use base path (and the base query
	// unless the relative URL supplies its own).
	if ( !$rHasPath )
	{
		if ( $basePath !== '' )
			$r['path'] = $basePath;
		if ( !isset( $r['query'] ) && isset( $b['query'] ) )
			$r['query'] = $b['query'];
		return join_url( $r );
	}

	// If relative URL path doesn't start with /, merge with base path:
	// keep everything before the base path's last '/' and append.
	if ( $r['path'][0] !== '/' )
	{
		$base = core_text::strrchr( $basePath, '/', TRUE );
		if ( $base === FALSE )
			$base = '';
		$r['path'] = $base . '/' . $r['path'];
	}
	$r['path'] = url_remove_dot_segments( $r['path'] );
	return join_url( $r );
}
126
/**
 * Filter out "." and ".." segments from a URL's path and return
 * the result.
 *
 * This function implements the "remove_dot_segments" algorithm from
 * the RFC3986 specification for URLs, with one deliberate deviation
 * kept from the original implementation: empty segments are dropped,
 * so runs of slashes ("/a//b") collapse to a single slash ("/a/b").
 *
 * A path whose final segment is ".", ".." or empty names a directory,
 * so the result keeps a trailing slash ("/a/b/.." gives "/a/").
 *
 * The path is processed byte-wise.  This is safe for UTF-8 because
 * '/' and '.' are ASCII bytes that never occur inside a multi-byte
 * sequence.
 *
 * Parameters:
 * 	path	the path to filter
 *
 * Return values:
 * 	The filtered path with "." and ".." removed.  An empty path
 * 	is returned unchanged.
 */
function url_remove_dot_segments( $path )
{
	$path = (string) $path;
	if ( $path === '' )
		return '';

	$inSegs  = explode( '/', $path );
	$outSegs = array( );
	foreach ( $inSegs as $seg )
	{
		if ( $seg === '' || $seg === '.' )
			continue;
		if ( $seg === '..' )
			array_pop( $outSegs );	// No-op when already at the root.
		else
			$outSegs[] = $seg;
	}
	$outPath = implode( '/', $outSegs );

	if ( $path[0] === '/' )
		$outPath = '/' . $outPath;

	// The input denotes a directory when it contains a slash and its
	// last segment is empty (trailing '/'), "." or "..".
	$lastSeg = $inSegs[count( $inSegs ) - 1];
	$isDir   = count( $inSegs ) > 1 &&
		( $lastSeg === '' || $lastSeg === '.' || $lastSeg === '..' );
	if ( $isDir && $outPath !== '/' )
		$outPath .= '/';

	return $outPath;
}
169
/**
 * This function parses an absolute or relative URL and splits it
 * into individual components.
 *
 * RFC3986 specifies the components of a Uniform Resource Identifier (URI).
 * A portion of the ABNFs are repeated here:
 *
 *	URI-reference	= URI
 *			/ relative-ref
 *
 *	URI		= scheme ":" hier-part [ "?" query ] [ "#" fragment ]
 *
 *	relative-ref	= relative-part [ "?" query ] [ "#" fragment ]
 *
 *	hier-part	= "//" authority path-abempty
 *			/ path-absolute
 *			/ path-rootless
 *			/ path-empty
 *
 *	relative-part	= "//" authority path-abempty
 *			/ path-absolute
 *			/ path-noscheme
 *			/ path-empty
 *
 *	authority	= [ userinfo "@" ] host [ ":" port ]
 *
 * So, a URL has the following major components:
 *
 *	scheme
 *		The name of a method used to interpret the rest of
 *		the URL.  Examples:  "http", "https", "mailto", "file".
 *
 *	authority
 *		The name of the authority governing the URL's name
 *		space.  Examples:  "example.com", "user@example.com",
 *		"example.com:80", "user:password@example.com:80".
 *
 *		The authority may include a host name, port number,
 *		user name, and password.
 *
 *		The host may be a name, an IPv4 numeric address, or
 *		an IPv6 numeric address.
 *
 *	path
 *		The hierarchical path to the URL's resource.
 *		Examples:  "/index.htm", "/scripts/page.php".
 *
 *	query
 *		The data for a query.  Examples:  "?search=google.com".
 *
 *	fragment
 *		The name of a secondary resource relative to that named
 *		by the path.  Examples:  "#section1", "#header".
 *
 * An "absolute" URL must include a scheme and path.  The authority, query,
 * and fragment components are optional.
 *
 * A "relative" URL does not include a scheme and must include a path.  The
 * authority, query, and fragment components are optional.
 *
 * This function splits the $url argument into the following components
 * and returns them in an associative array.  Keys to that array include:
 *
 *	"scheme"	The scheme, such as "http" (always lower-cased).
 *	"host"		The host name, IPv4, or IPv6 address (an IPv6
 *			address is returned without its brackets).
 *	"port"		The port number.
 *	"user"		The user name.
 *	"pass"		The user password.
 *	"path"		The path, such as a file path for "http".
 *	"query"		The query.
 *	"fragment"	The fragment.
 *
 * One or more of these may not be present, depending upon the URL.
 *
 * Optionally, the "user", "pass", "host" (if matched as a host name),
 * "path", "query", and "fragment" may have percent-encoded characters
 * decoded.  The "scheme" and "port" cannot include percent-encoded
 * characters and are never decoded.  Decoding occurs after the URL has
 * been parsed.
 *
 * Parameters:
 * 	url		the URL to parse.
 *
 * 	decode		an optional boolean flag selecting whether
 * 			to decode percent encoding or not.  Default = FALSE.
 *
 * Return values:
 * 	the associative array of URL parts (an empty array when the
 * 	URL is the empty string), or FALSE if the URL is too malformed
 * 	to recognize any parts.
 */
function split_url( $url, $decode=FALSE)
{
	// Character sets from RFC3986.
	$xunressub     = 'a-zA-Z\d\-._~\!$&\'()*+,;=';
	$xpchar        = $xunressub . ':@% ';

	// Scheme from RFC3986.  The '-' is escaped so that "+-." is not
	// read as a character range (which would also admit ',').
	$xscheme        = '([a-zA-Z][a-zA-Z\d+\-.]*)';

	// User info (user + password) from RFC3986.
	$xuserinfo     = '((['  . $xunressub . '%]*)' .
	                 '(:([' . $xunressub . ':%]*))?)';

	// IPv4 from RFC3986 (without digit constraints).
	$xipv4         = '(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})';

	// IPv6 from RFC2732 (without digit and grouping constraints).
	$xipv6         = '(\[([a-fA-F\d.:]+)\])';

	// Host name from RFC1035.  Technically, must start with a letter.
	// Relax that restriction to better parse URL structure, then
	// leave host name validation to application.
	$xhost_name    = '([a-zA-Z\d\-.%]+)';

	// Authority from RFC3986.  Skip IP future.
	$xhost         = '(' . $xhost_name . '|' . $xipv4 . '|' . $xipv6 . ')';
	$xport         = '(\d*)';
	$xauthority    = '((' . $xuserinfo . '@)?' . $xhost .
		         '?(:' . $xport . ')?)';

	// Path from RFC3986.  Blend absolute & relative for efficiency.
	$xslash_seg    = '(/[' . $xpchar . ']*)';
	$xpath_authabs = '((//' . $xauthority . ')((/[' . $xpchar . ']*)*))';
	$xpath_rel     = '([' . $xpchar . ']+' . $xslash_seg . '*)';
	$xpath_abs     = '(/(' . $xpath_rel . ')?)';
	$xapath        = '(' . $xpath_authabs . '|' . $xpath_abs .
			 '|' . $xpath_rel . ')';

	// Query and fragment from RFC3986.
	$xqueryfrag    = '([' . $xpchar . '/?' . ']*)';

	// URL.
	$xurl          = '^(' . $xscheme . ':)?' .  $xapath . '?' .
	                 '(\?' . $xqueryfrag . ')?(#' . $xqueryfrag . ')?$';


	// Split the URL into components.
	if ( !preg_match( '!' . $xurl . '!', $url, $m ) )
		return FALSE;

	// Always return an array, even when no component was recognised.
	// Groups are tested with isset() and !== '' rather than empty()
	// because "0" is a legitimate host or path.  Trailing groups that
	// did not take part in the match are absent from $m.
	$parts = array( );

	if ( isset( $m[2] ) && $m[2] !== '' )
		$parts['scheme'] = strtolower( $m[2] );

	// Group 7 is "userinfo@"; group 10 is ":password".
	if ( isset( $m[7] ) && $m[7] !== '' )
		$parts['user'] = isset( $m[9] ) ? $m[9] : '';
	if ( isset( $m[10] ) && $m[10] !== '' )
		$parts['pass'] = $m[11];

	// Groups 13/14/16 are host name, IPv4 and bracket-less IPv6.
	// Group 5 is "//authority": when it matched without a host the
	// authority is present but empty.
	$hostIsName = FALSE;
	if ( isset( $m[13] ) && $m[13] !== '' )
	{
		$parts['host'] = $m[13];
		$hostIsName    = TRUE;
	}
	else if ( isset( $m[14] ) && $m[14] !== '' )
		$parts['host'] = $m[14];
	else if ( isset( $m[16] ) && $m[16] !== '' )
		$parts['host'] = $m[16];
	else if ( isset( $m[5] ) && $m[5] !== '' )
		$parts['host'] = '';
	if ( isset( $m[17] ) && $m[17] !== '' )
		$parts['port'] = $m[18];

	// Path after an authority, absolute path, or relative path.
	if ( isset( $m[19] ) && $m[19] !== '' )
		$parts['path'] = $m[19];
	else if ( isset( $m[21] ) && $m[21] !== '' )
		$parts['path'] = $m[21];
	else if ( isset( $m[25] ) && $m[25] !== '' )
		$parts['path'] = $m[25];

	// Groups 27/29 include the leading '?' / '#', so an empty query
	// or fragment is still reported (as an empty string).
	if ( isset( $m[27] ) && $m[27] !== '' )
		$parts['query'] = $m[28];
	if ( isset( $m[29] ) && $m[29] !== '' )
		$parts['fragment'] = $m[30];

	if ( !$decode )
		return $parts;

	foreach ( array( 'user', 'pass', 'path', 'query', 'fragment' ) as $key )
	{
		if ( isset( $parts[$key] ) && $parts[$key] !== '' )
			$parts[$key] = rawurldecode( $parts[$key] );
	}
	// Only a host matched by the host-name rule may be percent-encoded.
	if ( $hostIsName )
		$parts['host'] = rawurldecode( $parts['host'] );
	return $parts;
}
347
/**
 * This function joins together URL components to form a complete URL.
 *
 * RFC3986 specifies the components of a Uniform Resource Identifier (URI).
 * This function implements the specification's "component recomposition"
 * algorithm for combining URI components into a full URI string.
 *
 * The $parts argument is an associative array containing zero or
 * more of the following:
 *
 *	"scheme"	The scheme, such as "http".
 *	"host"		The host name, IPv4, or IPv6 address.
 *	"port"		The port number.
 *	"user"		The user name.
 *	"pass"		The user password.
 *	"path"		The path, such as a file path for "http".
 *	"query"		The query.
 *	"fragment"	The fragment.
 *
 * The "port", "user", and "pass" values are only used when a "host"
 * is present.  A bracket-less IPv6 host is wrapped in "[...]".  When
 * a host is present and the path does not start with "/", a "/" is
 * inserted between them.
 *
 * The optional $encode argument indicates if appropriate URL components
 * should be percent-encoded as they are assembled into the URL.  Encoding
 * is only applied to the "user", "pass", "host" (unless it consists
 * solely of IP-literal characters, optionally in brackets), "path",
 * "query", and "fragment" components.  The "scheme" and "port" are never
 * encoded.  Slashes in the path are left alone so each segment of the
 * hierarchy is encoded separately.  Note that the query and fragment are
 * encoded as a whole, including any "=" and "&" they contain.
 *
 * Parameters:
 * 	parts		an associative array of strings containing the
 * 			individual parts of a URL.
 *
 * 	encode		an optional boolean flag selecting whether
 * 			to do percent encoding or not.  Default = FALSE.
 *
 * Return values:
 * 	Returns the assembled URL string.  The string is an absolute
 * 	URL if a scheme is supplied, and a relative URL if not.  An
 * 	empty string is returned if the $parts array does not contain
 * 	any of the needed values.
 */
function join_url( $parts, $encode=FALSE)
{
	// Normalise the two components that are tested for emptiness.
	// Explicit string comparison is used instead of empty() so that
	// a path of "0" is not silently discarded.
	$scheme = isset( $parts['scheme'] ) ? (string) $parts['scheme'] : '';
	$path   = isset( $parts['path'] )   ? (string) $parts['path']   : '';

	if ( $encode )
	{
		if ( isset( $parts['user'] ) )
			$parts['user']     = rawurlencode( $parts['user'] );
		if ( isset( $parts['pass'] ) )
			$parts['pass']     = rawurlencode( $parts['pass'] );
		// Skip hosts made up only of IP-literal characters.  Both
		// alternatives are anchored at both ends.
		if ( isset( $parts['host'] ) &&
			!preg_match( '!^(\[[\da-f.:]+\]|[\da-f.:]+)$!i', $parts['host'] ) )
			$parts['host']     = rawurlencode( $parts['host'] );
		if ( $path !== '' )
			$path              = str_replace( '%2F', '/', rawurlencode( $path ) );
		if ( isset( $parts['query'] ) )
			$parts['query']    = rawurlencode( $parts['query'] );
		if ( isset( $parts['fragment'] ) )
			$parts['fragment'] = rawurlencode( $parts['fragment'] );
	}

	$url = '';
	if ( $scheme !== '' )
		$url .= $scheme . ':';
	if ( isset( $parts['host'] ) )
	{
		$url .= '//';
		if ( isset( $parts['user'] ) )
		{
			$url .= $parts['user'];
			if ( isset( $parts['pass'] ) )
				$url .= ':' . $parts['pass'];
			$url .= '@';
		}
		if ( preg_match( '!^[\da-f]*:[\da-f.:]+$!i', $parts['host'] ) )
			$url .= '[' . $parts['host'] . ']';	// IPv6
		else
			$url .= $parts['host'];			// IPv4 or name
		if ( isset( $parts['port'] ) )
			$url .= ':' . $parts['port'];
		// Keep authority and a rootless path apart.
		if ( $path !== '' && $path[0] !== '/' )
			$url .= '/';
	}
	$url .= $path;
	if ( isset( $parts['query'] ) )
		$url .= '?' . $parts['query'];
	if ( isset( $parts['fragment'] ) )
		$url .= '#' . $parts['fragment'];
	return $url;
}
444
/**
 * Percent-encode a URL so that it only contains characters that are
 * allowed in a URL.
 *
 * The whole string is first percent-encoded.  The encodings of the
 * RFC3986 reserved characters are then turned back into the literal
 * characters, so the URL keeps its structure.  The percent sign is
 * restored last, which is why sequences that were already encoded
 * are not encoded a second time ('%20' stays '%20' and does not
 * become '%2520').
 *
 * Parameters:
 * 	url		the url to encode.
 *
 * Return values:
 * 	Returns the encoded URL string.
 */
function encode_url($url) {
  // Parallel lists: the encoded form and the literal it stands for.
  // The order is significant, '%25' has to be handled at the very end.
  $encodedForms = array(
    '%3A', '%2F', '%3F', '%23', '%5B', '%5D', '%40',
    '%21', '%24', '%26', '%27', '%28', '%29', '%2A',
    '%2B', '%2C', '%3B', '%3D', '%25',
  );
  $literalChars = array(
    ':', '/', '?', '#', '[', ']', '@',
    '!', '$', '&', "'", '(', ')', '*',
    '+', ',', ';', '=', '%',
  );

  $fullyEncoded = rawurlencode($url);
  return str_ireplace($encodedForms, $literalChars, $fullyEncoded);
}
488
/**
 * Extract URLs from a web page.
 *
 * URLs are extracted from a long list of tags and attributes as defined
 * by the HTML 2.0, HTML 3.2, HTML 4.01, and draft HTML 5.0 specifications.
 * URLs are also extracted from tags and attributes that are common
 * extensions of HTML, from the draft Forms 2.0 specification, from XHTML,
 * and from WML 1.3 and 2.0.
 *
 * The function returns an associative array of associative arrays of
 * arrays of URLs.  The outermost array's keys are the tag (element) name,
 * such as "a" for <a> or "img" for <img>.  The values for these entries
 * are associative arrays where the keys are attribute names for those
 * tags, such as "href" for <a href="...">.  Finally, the values for
 * those arrays are URLs found in those tags and attributes throughout
 * the text.
 *
 * Besides the per-element entries, two special keys may appear:
 *
 *	"meta"		URLs from <meta http-equiv="..."> elements, stored
 *			under the "http-equiv" attribute key.
 *	"style"		URLs found in style attributes and <style> bodies,
 *			in the format returned by extract_css_urls().
 *
 * Attribute values may be double-quoted, single-quoted, or unquoted.
 * The "archive" attribute holds a URL list and is split into its
 * entries: space-separated for <object>, comma-separated for <applet>.
 *
 * Parameters:
 * 	text		the UTF-8 text to scan
 *
 * Return values:
 * 	an associative array where keys are tags and values are an
 * 	associative array where keys are attributes and values are
 * 	an array of URLs.  An empty array is returned when nothing
 * 	is found.
 *
 * See:
 * 	http://nadeausoftware.com/articles/2008/01/php_tip_how_extract_urls_web_page
 */
function extract_html_urls( $text )
{
	$match_elements = array(
		// HTML
		array('element'=>'a',		'attribute'=>'href'),		// 2.0
		array('element'=>'a',		'attribute'=>'urn'),		// 2.0
		array('element'=>'base',	'attribute'=>'href'),		// 2.0
		array('element'=>'form',	'attribute'=>'action'),		// 2.0
		array('element'=>'img',		'attribute'=>'src'),		// 2.0
		array('element'=>'link',	'attribute'=>'href'),		// 2.0

		array('element'=>'applet',	'attribute'=>'code'),		// 3.2
		array('element'=>'applet',	'attribute'=>'codebase'),	// 3.2
		array('element'=>'area',	'attribute'=>'href'),		// 3.2
		array('element'=>'body',	'attribute'=>'background'),	// 3.2
		array('element'=>'img',		'attribute'=>'usemap'),		// 3.2
		array('element'=>'input',	'attribute'=>'src'),		// 3.2

		array('element'=>'applet',	'attribute'=>'archive'),	// 4.01
		array('element'=>'applet',	'attribute'=>'object'),		// 4.01
		array('element'=>'blockquote',	'attribute'=>'cite'),		// 4.01
		array('element'=>'del',		'attribute'=>'cite'),		// 4.01
		array('element'=>'frame',	'attribute'=>'longdesc'),	// 4.01
		array('element'=>'frame',	'attribute'=>'src'),		// 4.01
		array('element'=>'head',	'attribute'=>'profile'),	// 4.01
		array('element'=>'iframe',	'attribute'=>'longdesc'),	// 4.01
		array('element'=>'iframe',	'attribute'=>'src'),		// 4.01
		array('element'=>'img',		'attribute'=>'longdesc'),	// 4.01
		array('element'=>'input',	'attribute'=>'usemap'),		// 4.01
		array('element'=>'ins',		'attribute'=>'cite'),		// 4.01
		array('element'=>'object',	'attribute'=>'archive'),	// 4.01
		array('element'=>'object',	'attribute'=>'classid'),	// 4.01
		array('element'=>'object',	'attribute'=>'codebase'),	// 4.01
		array('element'=>'object',	'attribute'=>'data'),		// 4.01
		array('element'=>'object',	'attribute'=>'usemap'),		// 4.01
		array('element'=>'q',		'attribute'=>'cite'),		// 4.01
		array('element'=>'script',	'attribute'=>'src'),		// 4.01

		array('element'=>'audio',	'attribute'=>'src'),		// 5.0
		array('element'=>'command',	'attribute'=>'icon'),		// 5.0
		array('element'=>'embed',	'attribute'=>'src'),		// 5.0
		array('element'=>'event-source','attribute'=>'src'),		// 5.0
		array('element'=>'html',	'attribute'=>'manifest'),	// 5.0
		array('element'=>'source',	'attribute'=>'src'),		// 5.0
		array('element'=>'video',	'attribute'=>'src'),		// 5.0
		array('element'=>'video',	'attribute'=>'poster'),		// 5.0

		array('element'=>'bgsound',	'attribute'=>'src'),		// Extension
		array('element'=>'body',	'attribute'=>'credits'),	// Extension
		array('element'=>'body',	'attribute'=>'instructions'),	// Extension
		array('element'=>'body',	'attribute'=>'logo'),		// Extension
		array('element'=>'div',		'attribute'=>'href'),		// Extension
		array('element'=>'div',		'attribute'=>'src'),		// Extension
		array('element'=>'embed',	'attribute'=>'code'),		// Extension
		array('element'=>'embed',	'attribute'=>'pluginspage'),	// Extension
		array('element'=>'html',	'attribute'=>'background'),	// Extension
		array('element'=>'ilayer',	'attribute'=>'src'),		// Extension
		array('element'=>'img',		'attribute'=>'dynsrc'),		// Extension
		array('element'=>'img',		'attribute'=>'lowsrc'),		// Extension
		array('element'=>'input',	'attribute'=>'dynsrc'),		// Extension
		array('element'=>'input',	'attribute'=>'lowsrc'),		// Extension
		array('element'=>'table',	'attribute'=>'background'),	// Extension
		array('element'=>'td',		'attribute'=>'background'),	// Extension
		array('element'=>'th',		'attribute'=>'background'),	// Extension
		array('element'=>'layer',	'attribute'=>'src'),		// Extension
		array('element'=>'xml',		'attribute'=>'src'),		// Extension

		array('element'=>'button',	'attribute'=>'action'),		// Forms 2.0
		array('element'=>'datalist',	'attribute'=>'data'),		// Forms 2.0
		array('element'=>'form',	'attribute'=>'data'),		// Forms 2.0
		array('element'=>'input',	'attribute'=>'action'),		// Forms 2.0
		array('element'=>'select',	'attribute'=>'data'),		// Forms 2.0

		// XHTML
		array('element'=>'html',	'attribute'=>'xmlns'),

		// WML
		array('element'=>'access',	'attribute'=>'path'),		// 1.3
		array('element'=>'card',	'attribute'=>'onenterforward'),	// 1.3
		array('element'=>'card',	'attribute'=>'onenterbackward'),// 1.3
		array('element'=>'card',	'attribute'=>'ontimer'),	// 1.3
		array('element'=>'go',		'attribute'=>'href'),		// 1.3
		array('element'=>'option',	'attribute'=>'onpick'),		// 1.3
		array('element'=>'template',	'attribute'=>'onenterforward'),	// 1.3
		array('element'=>'template',	'attribute'=>'onenterbackward'),// 1.3
		array('element'=>'template',	'attribute'=>'ontimer'),	// 1.3
		array('element'=>'wml',		'attribute'=>'xmlns'),		// 2.0
	);

	$match_metas = array(
		'content-base',
		'content-location',
		'referer',
		'location',
		'refresh',
	);

	$urls = array( );

	// Extract all elements (the text between '<' and '>' of each
	// opening tag; closing tags are skipped).
	if ( !preg_match_all( '/<([a-z][^>]*)>/iu', $text, $matches ) )
		return $urls;
	$elements = $matches[1];

	// Attribute value: "double-quoted", 'single-quoted' or bare.  The
	// branch-reset group (?|...) makes every alternative capture into
	// group 1, so the value is always found at the same index.
	$value_pattern = '=(?|"([^"]*)"|\'([^\']*)\'|([^\s]*))';

	// Match elements and attributes
	foreach ( $match_elements as $match_element )
	{
		$name = $match_element['element'];
		$attr = $match_element['attribute'];
		// The 's' modifier lets '.' cross line breaks so that tags
		// written over several lines are matched too.
		$pattern = '/^' . $name . '\s.*' . $attr . $value_pattern . '/isu';

		// Only "archive" holds a URL list.
		if ( $attr != 'archive' )
			$split_pattern = NULL;			// Single URL
		else if ( $name == 'object' )
			$split_pattern = '/\s+/u';		// Space-separated URL list
		else
			$split_pattern = '/\s*,\s*/u';		// Comma-separated URL list

		foreach ( $elements as $element )
		{
			if ( !preg_match( $pattern, $element, $match ) )
				continue;
			$m = isset( $match[1] ) ? $match[1] : '';
			if ( $split_pattern === NULL )
			{
				$urls[$name][$attr][] = $m;
				continue;
			}
			$msplit = preg_split( $split_pattern, $m, -1, PREG_SPLIT_NO_EMPTY );
			if ( $msplit === FALSE )
				continue;
			foreach ( $msplit as $ms )
				$urls[$name][$attr][] = $ms;
		}
	}

	// Match meta http-equiv elements
	$content_pattern = '/content'  . $value_pattern . '/iu';
	$refresh_pattern = '/\d*;\s*(url=)?(.*)$/iu';
	foreach ( $match_metas as $match_meta )
	{
		$attr_pattern = '/http-equiv=["\']?' . $match_meta . '["\']?/iu';
		foreach ( $elements as $element )
		{
			if ( !preg_match( '/^meta/iu', $element ) ||
				!preg_match( $attr_pattern, $element ) ||
				!preg_match( $content_pattern, $element, $match ) )
				continue;
			$m = isset( $match[1] ) ? $match[1] : '';
			if ( $match_meta != 'refresh' )
				$urls['meta']['http-equiv'][] = $m;
			// A refresh value looks like "5; url=...": keep the URL.
			else if ( preg_match( $refresh_pattern, $m, $refresh ) )
				$urls['meta']['http-equiv'][] = $refresh[2];
		}
	}

	// Match style attributes
	$urls['style'] = array( );
	$style_pattern = '/style' . $value_pattern . '/iu';
	foreach ( $elements as $element )
	{
		if ( !preg_match( $style_pattern, $element, $match ) )
			continue;
		$m = isset( $match[1] ) ? $match[1] : '';
		$style_urls = extract_css_urls( $m );
		if ( !empty( $style_urls ) )
			$urls['style'] = array_merge_recursive(
				$urls['style'], $style_urls );
	}

	// Match style bodies
	if ( preg_match_all( '/<style[^>]*>(.*?)<\/style>/siu', $text, $style_bodies ) )
	{
		foreach ( $style_bodies[1] as $style_body )
		{
			$style_urls = extract_css_urls( $style_body );
			if ( !empty( $style_urls ) )
				$urls['style'] = array_merge_recursive(
					$urls['style'], $style_urls );
		}
	}
	if ( empty($urls['style']) )
		unset( $urls['style'] );

	return $urls;
}
/**
 * Extract URLs from UTF-8 CSS text.
 *
 * URLs within @import statements and url() property functions are extracted
 * and returned in an associative array of arrays.  Array keys indicate
 * the use context for the URL, including:
 *
 * 	"import"
 * 	"property"
 *
 * Each value in the associative array is an array of URLs.  Backslash
 * escapes inside a URL are resolved (the backslash is removed).  A URL
 * may not contain an unescaped quote, comma, space, or parenthesis.
 *
 * Within "import", URLs from the quoted form (@import "...") are listed
 * before those from the function form (@import url(...)).
 *
 * Parameters:
 * 	text		the UTF-8 text to scan
 *
 * Return values:
 * 	an associative array of arrays of URLs.  The array is empty
 * 	when no URL is found.
 *
 * See:
 * 	http://nadeausoftware.com/articles/2008/01/php_tip_how_extract_urls_css_file
 */
function extract_css_urls( $text )
{
	$urls = array( );

	// A URL is a run of ordinary characters and backslash escapes.
	// Each character can be consumed in exactly one way and the
	// quantifier is possessive, so a url( that never closes fails in
	// linear time.  A nested form such as (X*(Y)?)+ would backtrack
	// exponentially and exhaust the PCRE limits.
	$url_pattern     = '((?:[^\\\\\'", \(\)]|\\\\.)*+)';
	$urlfunc_pattern = 'url\(\s*[\'"]?' . $url_pattern . '[\'"]?\s*\)';
	$pattern         = '/(' .
		 '(@import\s*[\'"]' . $url_pattern     . '[\'"])' .
		'|(@import\s*'      . $urlfunc_pattern . ')'      .
		'|('                . $urlfunc_pattern . ')'      .  ')/iu';
	if ( !preg_match_all( $pattern, $text, $matches ) )
		return $urls;

	// Capture group holding the URL for each syntax form:
	//	3	@import '...'  /  @import "..."
	//	5	@import url(...)
	//	7	url(...)
	$contexts = array( 3 => 'import', 5 => 'import', 7 => 'property' );
	foreach ( $contexts as $group => $context )
	{
		foreach ( $matches[$group] as $match )
		{
			// Forms that did not take part in a match yield ''.
			if ( $match === '' )
				continue;
			$urls[$context][] =
				preg_replace( '/\\\\(.)/u', '\\1', $match );
		}
	}

	return $urls;
}
757