1<?php
2
3/**
4 * Parses a URI into the components and fragment identifier as specified
5 * by RFC 3986.
6 */
class HTMLPurifier_URIParser
{

    /**
     * Instance of HTMLPurifier_PercentEncoder to do normalization with.
     */
    protected $percentEncoder;

    /**
     * Creates the percent encoder that parse() runs every URI through.
     */
    public function __construct()
    {
        $this->percentEncoder = new HTMLPurifier_PercentEncoder();
    }

    /**
     * Parses a URI.
     * @param $uri string URI to parse
     * @return HTMLPurifier_URI representation of URI, or boolean false if
     *         the URI regexp did not match. The representation has not
     *         been validated yet and may not conform to RFC.
     */
    public function parse($uri)
    {
        $uri = $this->percentEncoder->normalize($uri);

        // Regexp is as per Appendix B.
        // Note that ["<>] are an addition to the RFC's recommended
        // characters, because they represent external delimiters.
        // The numbers below are the capture groups read out further down;
        // the odd-numbered groups 1, 3, 6 and 8 additionally include the
        // component's own delimiter (":", "//", "?", "#").
        $r_URI = '!'.
            '(([a-zA-Z0-9\.\+\-]+):)?'. // 2. Scheme
            '(//([^/?#"<>]*))?'. // 4. Authority
            '([^?#"<>]*)'.       // 5. Path
            '(\?([^#"<>]*))?'.   // 7. Query
            '(#([^"<>]*))?'.     // 9. Fragment
            '!';

        $matches = array();
        $result = preg_match($r_URI, $uri, $matches);

        if (!$result) return false; // *really* invalid URI

        // separate out parts
        // Presence is tested on the delimiter-carrying group, so a component
        // that is present but empty (e.g. "?" with nothing after it) yields
        // '' rather than null.
        $scheme     = !empty($matches[1]) ? $matches[2] : null;
        $authority  = !empty($matches[3]) ? $matches[4] : null;
        $path       = $matches[5]; // always present, can be empty
        $query      = !empty($matches[6]) ? $matches[7] : null;
        $fragment   = !empty($matches[8]) ? $matches[9] : null;

        // further parse authority
        if ($authority !== null) {
            // Groups: 2 = userinfo, 3 = host (bracketed literal or anything
            // up to the first ":"), 5 = port digits.
            $r_authority = "/^((.+?)@)?(\[[^\]]+\]|[^:]*)(:(\d*))?/";
            $matches = array();
            preg_match($r_authority, $authority, $matches);
            $userinfo   = !empty($matches[1]) ? $matches[2] : null;
            // Take the host group verbatim: empty() treats the string "0"
            // as empty, which would turn the host "0" into ''.
            $host       = isset($matches[3]) ? $matches[3] : '';
            $port       = !empty($matches[4]) ? (int) $matches[5] : null;
        } else {
            $port = $host = $userinfo = null;
        }

        return new HTMLPurifier_URI(
            $scheme, $userinfo, $host, $port, $path, $query, $fragment);
    }

}
70
71// vim: et sw=4 sts=4
72