1 /* Copyright 2002-2006, 2018 Elliotte Rusty Harold
2 
3    This library is free software; you can redistribute it and/or modify
4    it under the terms of version 2.1 of the GNU Lesser General Public
5    License as published by the Free Software Foundation.
6 
7    This library is distributed in the hope that it will be useful,
8    but WITHOUT ANY WARRANTY; without even the implied warranty of
9    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10    GNU Lesser General Public License for more details.
11 
12    You should have received a copy of the GNU Lesser General Public
13    License along with this library; if not, write to the
14    Free Software Foundation, Inc., 59 Temple Place, Suite 330,
15    Boston, MA 02111-1307  USA
16 
17    You can contact Elliotte Rusty Harold by sending e-mail to
18    elharo@ibiblio.org. Please include the word "XOM" in the
19    subject line. The XOM home page is located at http://www.xom.nu/
20 */
21 
22 package nu.xom;
23 
24 import java.io.DataInputStream;
25 import java.io.IOException;
26 import java.io.InputStream;
27 import java.io.Reader;
28 import java.io.StringReader;
29 import java.util.StringTokenizer;
30 
31 import org.xml.sax.EntityResolver;
32 import org.xml.sax.InputSource;
33 import org.xml.sax.SAXException;
34 import org.xml.sax.XMLReader;
35 
36 /**
37  * <p>
38  * <code>Verifier</code> checks names and data for
39  * compliance with XML 1.0 and Namespaces in XML rules.
40  * </p>
41  *
42  * @author Elliotte Rusty Harold
43  * @version 1.2.11
44  *
45  */
46 final class Verifier {
47 
    // Private constructor: prevents instantiation of this class from
    // outside; the checks visible in this class are static methods.
    private Verifier() {}
49 
    // constants for the bit flags in the characters lookup table
    // Tested by checkPCDATA: the character is allowed in XML content.
    private final static byte XML_CHARACTER        = 1;
    // Tested by checkXMLName for every character after the first.
    private final static byte NAME_CHARACTER       = 2;
    // Tested by checkNCName and checkXMLName for the first character.
    private final static byte NAME_START_CHARACTER = 4;
    // Tested by checkNCName for every character after the first.
    private final static byte NCNAME_CHARACTER     = 8;

    // Lookup table indexed by char value. Stays null until loadFlags
    // fills it with 65536 bytes read from nu/xom/characters.dat.
    private       static byte[] flags = null;
57 
58     static {
59 
60         ClassLoader loader = Verifier.class.getClassLoader();
61         if (loader != null) loadFlags(loader);
62         // If that didn't work, try a different ClassLoader
63         if (flags == null) {
64             loader = Thread.currentThread().getContextClassLoader();
65             loadFlags(loader);
66         }
67 
68     }
69 
70 
loadFlags(ClassLoader loader)71     private static void loadFlags(ClassLoader loader) {
72 
73         DataInputStream in = null;
74         try {
75             InputStream raw = loader.getResourceAsStream("nu/xom/characters.dat");
76             if (raw == null) {
77                 throw new RuntimeException("Broken XOM installation: "
78                   + "could not load nu/xom/characters.dat");
79             }
80             // buffer this????
81             in = new DataInputStream(raw);
82             flags = new byte[65536];
83             in.readFully(flags);
84         }
85         catch (IOException ex) {
86             throw new RuntimeException("Broken XOM installation: "
87               + "could not load nu/xom/characters.dat");
88         }
89         finally {
90             try {
91                 if (in != null) in.close();
92             }
93             catch (IOException ex) {
94                 // no big deal
95             }
96         }
97 
98     }
99 
100 
101     /**
102      * <p>
103      * Check whether <code>name</code> is
104      * a non-colonized name as defined in
105      * <cite>Namespaces in XML</cite>.
106      * </p>
107      *
108      * @param name <code>String</code> name to check
109      *
110      * @throws IllegalNameException if <code>name</code> is not a
111      *     non-colonized name
112      */
checkNCName(String name)113     static void checkNCName(String name) {
114 
115         if (name == null) {
116             throwIllegalNameException(name, "NCNames cannot be null");
117         }
118 
119         int length = name.length();
120         if (length == 0) {
121             throwIllegalNameException(name, "NCNames cannot be empty");
122         }
123 
124         char first = name.charAt(0);
125         if ((flags[first] & NAME_START_CHARACTER) == 0) {
126             throwIllegalNameException(name, "NCNames cannot start " +
127               "with the character " + Integer.toHexString(first));
128         }
129 
130         for (int i = 1; i < length; i++) {
131             char c = name.charAt(i);
132             if ((flags[c] & NCNAME_CHARACTER) == 0) {
133                 if (c == ':') {
134                     throwIllegalNameException(name, "NCNames cannot contain colons");
135                 }
136                 else {
137                     throwIllegalNameException(name, "0x"
138                       + Integer.toHexString(c) + " is not a legal NCName character");
139                 }
140             }
141         }
142 
143     }
144 
145 
throwIllegalNameException(String name, String message)146     private static void throwIllegalNameException(String name, String message) {
147         IllegalNameException ex = new IllegalNameException(message);
148         ex.setData(name);
149         throw ex;
150     }
151 
152 
throwIllegalCharacterDataException(String data, String message)153     private static void throwIllegalCharacterDataException(String data, String message) {
154         IllegalDataException ex = new IllegalCharacterDataException(message);
155         ex.setData(data);
156         throw ex;
157     }
158 
159 
throwMalformedURIException(String uri, String message)160     private static void throwMalformedURIException(String uri, String message) {
161         MalformedURIException ex = new MalformedURIException(message);
162         ex.setData(uri);
163         throw ex;
164     }
165 
166 
167     /**
168      * <p>
     * This method checks whether a string contains only
170      * characters allowed by the XML 1.0 specification.
171      * </p>
172      *
173      * @param text <code>String</code> value to verify
174      *
175      * @throws IllegalCharacterDataException if <code>text</code> is
176      *     not legal PCDATA
177      */
checkPCDATA(String text)178     static void checkPCDATA(String text) {
179 
180         if (text == null) throw new IllegalCharacterDataException("Null text");
181 
182         char[] data = text.toCharArray();
183         for (int i = 0, len = data.length; i < len; i++) {
184             int result = data[i];
185             if (result >= 0xD800 && result <= 0xDBFF) {
186                 try {
187                     int low = data[i+1];
188                     if (low < 0xDC00 || low > 0xDFFF) {
189                         IllegalCharacterDataException ex
190                           = new IllegalCharacterDataException("Bad surrogate pair");
191                         ex.setData(text);
192                         throw ex;
193                     }
194                     i++; // increment past low surrogate
195                 }
196                 catch (ArrayIndexOutOfBoundsException ex) {
197                     IllegalCharacterDataException ide
198                       = new IllegalCharacterDataException("Bad Surrogate Pair", ex);
199                     ide.setData(text);
200                     throw ide;
201                 }
202                 // all properly matched surrogate pairs are legal in PCDATA
203             }  // end if
204             else if ((flags[result] & XML_CHARACTER) == 0) {
205                 throwIllegalCharacterDataException(text, "0x"
206                   + Integer.toHexString(result)
207                   + " is not allowed in XML content");
208             }
209 
210         }
211 
212     }
213 
214 
215     /**
216      * <p>
217      * Checks a string to see if it is a syntactically correct
218      * RFC 3986 URI reference. Both absolute and relative
219      * URIs are supported, as are URIs with fragment identifiers.
220      * </p>
221      *
222      * @param uri <code>String</code> containing the potential URI
223      *
224      * @throws MalformedURIException if this is not a
225      *     legal URI reference
226      */
checkURIReference(String uri)227     static void checkURIReference(String uri) {
228 
229         if ((uri == null) || uri.length() == 0) return;
230 
231         URIUtil.ParsedURI parsed = new URIUtil.ParsedURI(uri);
232         try {
233             if (parsed.scheme != null) checkScheme(parsed.scheme);
234             if (parsed.authority != null) checkAuthority(parsed.authority);
235             checkPath(parsed.path);
236             if (parsed.fragment != null) checkFragment(parsed.fragment);
237             if (parsed.query != null) checkQuery(parsed.query);
238         }
239         catch (MalformedURIException ex) {
240             ex.setData(uri);
241             throw ex;
242         }
243 
244     }
245 
246 
checkQuery(final String query)247     private static void checkQuery(final String query) {
248 
249         int length = query.length();
250         for (int i = 0; i < length; i++) {
251             char c = query.charAt(i);
252             if (c == '%') {
253                try {
254                    if (!isHexDigit(query.charAt(i+1)) || !isHexDigit(query.charAt(i+2))) {
255                        throwMalformedURIException(query,
256                          "Bad percent escape sequence");
257                    }
258                }
259                catch (StringIndexOutOfBoundsException ex) {
260                    throwMalformedURIException(query,
261                      "Bad percent escape sequence");
262                }
263                i += 2;
264             }
265             else if (!isQueryCharacter(c)) {
266                 throw new MalformedURIException(
267                   "Illegal query character " + c
268                 );
269             }
270         }
271 
272     }
273 
274 
275     // same for fragment ID
isQueryCharacter(char c)276     private static boolean isQueryCharacter(char c) {
277 
278         switch(c) {
279             case '!': return true;
280             case '"': return false;
281             case '#': return false;
282             case '$': return true;
283             case '%': return false; // tested in checkQuery
284             case '&': return true;
285             case '\'': return true;
286             case '(': return true;
287             case ')': return true;
288             case '*': return true;
289             case '+': return true;
290             case ',': return true;
291             case '-': return true;
292             case '.': return true;
293             case '/': return true;
294             case '0': return true;
295             case '1': return true;
296             case '2': return true;
297             case '3': return true;
298             case '4': return true;
299             case '5': return true;
300             case '6': return true;
301             case '7': return true;
302             case '8': return true;
303             case '9': return true;
304             case ':': return true;
305             case ';': return true;
306             case '<': return false;
307             case '=': return true;
308             case '>': return false;
309             case '?': return true;
310             case '@': return true;
311             case 'A': return true;
312             case 'B': return true;
313             case 'C': return true;
314             case 'D': return true;
315             case 'E': return true;
316             case 'F': return true;
317             case 'G': return true;
318             case 'H': return true;
319             case 'I': return true;
320             case 'J': return true;
321             case 'K': return true;
322             case 'L': return true;
323             case 'M': return true;
324             case 'N': return true;
325             case 'O': return true;
326             case 'P': return true;
327             case 'Q': return true;
328             case 'R': return true;
329             case 'S': return true;
330             case 'T': return true;
331             case 'U': return true;
332             case 'V': return true;
333             case 'W': return true;
334             case 'X': return true;
335             case 'Y': return true;
336             case 'Z': return true;
337             case '[': return false;
338             case '\\': return false;
339             case ']': return false;
340             case '^': return false;
341             case '_': return true;
342             case '`': return false;
343             case 'a': return true;
344             case 'b': return true;
345             case 'c': return true;
346             case 'd': return true;
347             case 'e': return true;
348             case 'f': return true;
349             case 'g': return true;
350             case 'h': return true;
351             case 'i': return true;
352             case 'j': return true;
353             case 'k': return true;
354             case 'l': return true;
355             case 'm': return true;
356             case 'n': return true;
357             case 'o': return true;
358             case 'p': return true;
359             case 'q': return true;
360             case 'r': return true;
361             case 's': return true;
362             case 't': return true;
363             case 'u': return true;
364             case 'v': return true;
365             case 'w': return true;
366             case 'x': return true;
367             case 'y': return true;
368             case 'z': return true;
369             case '{': return false;
370             case '|': return false;
371             case '}': return false;
372             case '~': return true;
373         }
374         return false;
375 
376     }
377 
378 
    /**
     * Verifies a fragment identifier by handing it to
     * <code>checkQuery</code>. Any exception raised for a bad
     * fragment is therefore the one <code>checkQuery</code> throws.
     *
     * @param fragment the fragment identifier to check
     */
    private static void checkFragment(String fragment) {
        // The BNF for fragments is the same as for query strings
        checkQuery(fragment);
    }
383 
384 
385     // Besides the legal characters issues, a path must
386     // not contain two consecutive forward slashes
checkPath(final String path)387     private static void checkPath(final String path) {
388 
389         int length = path.length();
390         char[] text = path.toCharArray();
391         for (int i = 0; i < length; i++) {
392             char c = text[i];
393             if (c == '/') {
394                 if (i < length-1) {
395                     if (text[i+1] == '/') {
396                         throwMalformedURIException(path,
397                           "Double slash (//) in path");
398                     }
399                 }
400             }
401             else if (c == '%') {
402                try {
403                    if (!isHexDigit(text[i+1])
404                      || !isHexDigit(text[i+2])) {
405                        throwMalformedURIException(path,
406                          "Bad percent escape sequence");
407                    }
408                }
409                catch (ArrayIndexOutOfBoundsException ex) {
410                    throwMalformedURIException(path,
411                      "Bad percent escape sequence");
412                }
413                i += 2;
414             }
415             else if (!isPathCharacter(c)) {
416                 throwMalformedURIException(path,
417                   "Illegal path character " + c
418                 );
419             }
420         }
421 
422     }
423 
424 
checkAuthority(String authority)425     private static void checkAuthority(String authority) {
426 
427         String userInfo = null;
428         String host = null;
429         String port = null;
430 
431         int atSign = authority.indexOf('@');
432         if (atSign != -1) {
433             userInfo = authority.substring(0, atSign);
434             authority = authority.substring(atSign+1);
435         }
436 
437         int colon;
438         if (authority.startsWith("[")) {
439             colon = authority.indexOf("]:");
440             if (colon != -1) colon = colon+1;
441         }
442         else colon = authority.indexOf(':');
443 
444         if (colon != -1) {
445             host = authority.substring(0, colon);
446             port = authority.substring(colon+1);
447         }
448         else {
449             host = authority;
450         }
451 
452         if (userInfo != null) checkUserInfo(userInfo);
453         if (port != null) checkPort(port);
454         checkHost(host);
455 
456     }
457 
458 
checkHost(final String host)459     private static void checkHost(final String host) {
460 
461         int length = host.length();
462         if (length == 0) return; // file URI
463 
464         char[] text = host.toCharArray();
465         if (text[0] == '[') {
466             if (text[length-1] != ']') {
467                 throw new MalformedURIException("Missing closing ]");
468             }
469                             // trim [ and ] from ends of host
470             checkIPv6Address(host.substring(1, length-1));
471         }
472         else {
473             if (length > 255) {
474                 throw new MalformedURIException("Host name too long: " + host);
475             }
476 
477             for (int i = 0; i < length; i++) {
478                 char c = text[i];
479                 if (c == '%') {
480                    try {
481                        if (!isHexDigit(text[i+1]) || !isHexDigit(text[i+2])) {
482                            throwMalformedURIException(host,
483                              "Bad percent escape sequence");
484                        }
485                    }
486                    catch (ArrayIndexOutOfBoundsException ex) {
487                        throwMalformedURIException(host,
488                          "Bad percent escape sequence");
489                    }
490                    i += 2;
491                 }
492                 else if (!isRegNameCharacter(c)) {
493                     throwMalformedURIException(host,
494                       "Illegal host character " + c
495                     );
496                 }
497             }
498         }
499     }
500 
501 
isRegNameCharacter(char c)502     private static boolean isRegNameCharacter(char c) {
503 
504         switch(c) {
505             case '!': return true;
506             case '"': return false;
507             case '#': return false;
508             case '$': return true;
509             case '%': return false; // checked separately
510             case '&': return true;
511             case '\'': return true;
512             case '(': return true;
513             case ')': return true;
514             case '*': return true;
515             case '+': return true;
516             case ',': return true;
517             case '-': return true;
518             case '.': return true;
519             case '/': return false;
520             case '0': return true;
521             case '1': return true;
522             case '2': return true;
523             case '3': return true;
524             case '4': return true;
525             case '5': return true;
526             case '6': return true;
527             case '7': return true;
528             case '8': return true;
529             case '9': return true;
530             case ':': return false;
531             case ';': return true;
532             case '<': return false;
533             case '=': return true;
534             case '>': return false;
535             case '?': return false;
536             case '@': return false;
537             case 'A': return true;
538             case 'B': return true;
539             case 'C': return true;
540             case 'D': return true;
541             case 'E': return true;
542             case 'F': return true;
543             case 'G': return true;
544             case 'H': return true;
545             case 'I': return true;
546             case 'J': return true;
547             case 'K': return true;
548             case 'L': return true;
549             case 'M': return true;
550             case 'N': return true;
551             case 'O': return true;
552             case 'P': return true;
553             case 'Q': return true;
554             case 'R': return true;
555             case 'S': return true;
556             case 'T': return true;
557             case 'U': return true;
558             case 'V': return true;
559             case 'W': return true;
560             case 'X': return true;
561             case 'Y': return true;
562             case 'Z': return true;
563             case '[': return false;
564             case '\\': return false;
565             case ']': return false;
566             case '^': return false;
567             case '_': return true;
568             case '`': return false;
569             case 'a': return true;
570             case 'b': return true;
571             case 'c': return true;
572             case 'd': return true;
573             case 'e': return true;
574             case 'f': return true;
575             case 'g': return true;
576             case 'h': return true;
577             case 'i': return true;
578             case 'j': return true;
579             case 'k': return true;
580             case 'l': return true;
581             case 'm': return true;
582             case 'n': return true;
583             case 'o': return true;
584             case 'p': return true;
585             case 'q': return true;
586             case 'r': return true;
587             case 's': return true;
588             case 't': return true;
589             case 'u': return true;
590             case 'v': return true;
591             case 'w': return true;
592             case 'x': return true;
593             case 'y': return true;
594             case 'z': return true;
595             case '{': return false;
596             case '|': return false;
597             case '}': return false;
598             case '~': return true;
599         }
600         return false;
601 
602     }
603 
604 
checkPort(String port)605     private static void checkPort(String port) {
606 
607         for (int i = port.length()-1; i >= 0; i--) {
608             char c = port.charAt(i);
609             if (c < '0' || c > '9') {
610                 throw new MalformedURIException("Bad port: " + port);
611             }
612         }
613 
614     }
615 
616 
checkUserInfo(String userInfo)617     private static void checkUserInfo(String userInfo) {
618 
619         int length = userInfo.length();
620         for (int i = 0; i < length; i++) {
621             char c = userInfo.charAt(i);
622             if (c == '%') {
623                try {
624                    if (!isHexDigit(userInfo.charAt(i+1))
625                      || !isHexDigit(userInfo.charAt(i+2))) {
626                        throwMalformedURIException(userInfo,
627                          "Bad percent escape sequence");
628                    }
629                }
630                catch (StringIndexOutOfBoundsException ex) {
631                    throwMalformedURIException(userInfo,
632                      "Bad percent escape sequence");
633                }
634                i += 2;
635             }
636             else if (!isUserInfoCharacter(c)) {
637                 throw new MalformedURIException("Bad user info: " + userInfo);
638             }
639         }
640 
641     }
642 
643 
checkScheme(String scheme)644     private static void checkScheme(String scheme) {
645 
646         // http is probably 99% of cases so check it first
647         if ("http".equals(scheme)) return;
648 
649         if (scheme.length() == 0) {
650             throw new MalformedURIException("URIs cannot begin with a colon");
651         }
652         char c = scheme.charAt(0);
653         if (!isAlpha(c)) {
654             throw new MalformedURIException(
655               "Illegal initial scheme character " + c);
656         }
657 
658         for (int i = scheme.length()-1; i >= 1; i--) {
659             c = scheme.charAt(i);
660             if (!isSchemeCharacter(c)) {
661                 throw new MalformedURIException(
662                   "Illegal scheme character " + c
663                 );
664             }
665         }
666 
667     }
668 
669 
670     // http://www.faqs.org/rfcs/rfc2373.html
671     // http://www.faqs.org/rfcs/rfc2732.html
checkIPv6Address(String ip6Address)672     private static void checkIPv6Address(String ip6Address) {
673 
674         StringTokenizer st = new StringTokenizer(ip6Address, ":", true);
675         int numTokens = st.countTokens();
676         if (numTokens > 15 || numTokens < 2) {
677             throw new MalformedURIException(
678               "Illegal IPv6 host address: " + ip6Address
679             );
680         }
681         for (int i = 0; i < numTokens; i++) {
682             String hexPart = st.nextToken();
683             if (":".equals(hexPart)) continue;
684             try {
685                 int part = Integer.parseInt(hexPart, 16);
686                 if (part < 0) {
687                       throw new MalformedURIException(
688                       "Illegal IPv6 host address: " + ip6Address
689                     );
690                 }
691             }
692             catch (NumberFormatException ex) {
693                 if (i == numTokens-1) {
694                     checkIP4Address(hexPart, ip6Address);
695                 }
696                 else {
697                     throwMalformedURIException(ip6Address,
698                       "Illegal IPv6 host address: " + ip6Address
699                     );
700                 }
701             }
702         }
703 
704         if (ip6Address.indexOf("::") != ip6Address.lastIndexOf("::")) {
705             throw new MalformedURIException(
706               "Illegal IPv6 host address: " + ip6Address
707             );
708         }
709 
710     }
711 
712 
checkIP4Address(String address, String ip6Address)713     private static void checkIP4Address(String address, String ip6Address) {
714 
715         StringTokenizer st = new StringTokenizer(address, ".");
716         int numTokens = st.countTokens();
717         if (numTokens != 4) {
718             throw new MalformedURIException(
719               "Illegal IPv6 host address: " + ip6Address
720             );
721         }
722         for (int i = 0; i < 4; i++) {
723             String decPart = st.nextToken();
724             // https://github.com/elharo/xom/issues/12
725             if (decPart.startsWith("+")) {
726                 throw new MalformedURIException(
727                         "Illegal IPv6 host address: " + ip6Address
728                       );
729             }
730             try {
731                 int dec = Integer.parseInt(decPart);
732                 if (dec > 255 || dec < 0) {
733                     throw new MalformedURIException(
734                       "Illegal IPv6 host address: " + ip6Address
735                     );
736                 }
737             }
738             catch (NumberFormatException ex) {
739                 throw new MalformedURIException(
740                   "Illegal IPv6 host address: " + ip6Address
741                 );
742             }
743         }
744 
745     }
746 
747 
checkXMLName(String name)748     static void checkXMLName(String name) {
749 
750         if (name == null) {
751             throwIllegalNameException(name, "XML names cannot be null");
752         }
753 
754         int length = name.length();
755         if (length == 0) {
756             throwIllegalNameException(name, "XML names cannot be empty");
757         }
758 
759         char first = name.charAt(0);
760         if ((flags[first] & NAME_START_CHARACTER) == 0) {
761             throwIllegalNameException(name, "XML names cannot start " +
762               "with the character " + Integer.toHexString(first));
763         }
764 
765         for (int i = 1; i < length; i++) {
766             char c = name.charAt(i);
767             if ((flags[c] & NAME_CHARACTER) == 0) {
768                 throwIllegalNameException(name, "0x"
769                   + Integer.toHexString(c)
770                   + " is not a legal name character");
771             }
772         }
773 
774     }
775 
776 
777     private static boolean[] C0Table = new boolean[0x21];
778     static {
779         C0Table['\n'] = true;
780         C0Table['\r'] = true;
781         C0Table['\t'] = true;
782         C0Table[' '] = true;
783     }
784 
785 
isXMLSpaceCharacter(char c)786     static boolean isXMLSpaceCharacter(char c) {
787         if (c > ' ') return false;
788         return C0Table[c];
789     }
790 
791 
isHexDigit(char c)792     private static boolean isHexDigit(char c) {
793 
794         switch(c) {
795             case '0': return true;
796             case '1': return true;
797             case '2': return true;
798             case '3': return true;
799             case '4': return true;
800             case '5': return true;
801             case '6': return true;
802             case '7': return true;
803             case '8': return true;
804             case '9': return true;
805             case ':': return false;
806             case ';': return false;
807             case '<': return false;
808             case '=': return false;
809             case '>': return false;
810             case '?': return false;
811             case '@': return false;
812             case 'A': return true;
813             case 'B': return true;
814             case 'C': return true;
815             case 'D': return true;
816             case 'E': return true;
817             case 'F': return true;
818             case 'G': return false;
819             case 'H': return false;
820             case 'I': return false;
821             case 'J': return false;
822             case 'K': return false;
823             case 'L': return false;
824             case 'M': return false;
825             case 'N': return false;
826             case 'O': return false;
827             case 'P': return false;
828             case 'Q': return false;
829             case 'R': return false;
830             case 'S': return false;
831             case 'T': return false;
832             case 'U': return false;
833             case 'V': return false;
834             case 'W': return false;
835             case 'X': return false;
836             case 'Y': return false;
837             case 'Z': return false;
838             case '[': return false;
839             case '\\': return false;
840             case ']': return false;
841             case '^': return false;
842             case '_': return false;
843             case '`': return false;
844             case 'a': return true;
845             case 'b': return true;
846             case 'c': return true;
847             case 'd': return true;
848             case 'e': return true;
849             case 'f': return true;
850         }
851         return false;
852     }
853 
854 
    // Since namespace URIs are commonly repeated, we can skip a lot of
    // redundant verification by remembering the ones we've seen before.
857     private static URICache cache = new URICache();
858 
859     private final static class URICache {
860 
861         private final static int LOAD = 6;
862         private String[] cache = new String[LOAD];
863         private int position = 0;
864 
contains(String s)865         synchronized boolean contains(String s) {
866 
867             for (int i = 0; i < LOAD; i++) {
868                 // Here I'm assuming the namespace URIs are interned.
869                 // This is commonly but not always true. This won't
870                 // break if they haven't been. Using equals() instead
871                 // of == is faster when the namespace URIs haven't been
872                 // interned but slower if they have.
873                 if (s == cache[i]) {
874                     return true;
875                 }
876             }
877             return false;
878 
879         }
880 
put(String s)881         synchronized void put(String s) {
882             cache[position] = s;
883             position++;
884             if (position == LOAD) position = 0;
885         }
886 
887     }
888 
889 
890     /**
891      * <p>
892      * Checks a string to see if it is an RFC 3986 absolute
893      * URI reference. URI references can contain fragment identifiers.
894      * Absolute URI references must have a scheme.
895      * </p>
896      *
897      * @param uri <code>String</code> to check
898      *
899      * @throws MalformedURIException if this is not a legal
900      *     URI reference
901      */
checkAbsoluteURIReference(String uri)902     static void checkAbsoluteURIReference(String uri) {
903 
904         if (cache.contains(uri)) {
905             return;
906         }
907         URIUtil.ParsedURI parsed = new URIUtil.ParsedURI(uri);
908         try {
909             if (parsed.scheme == null) {
910                 throwMalformedURIException(
911                   uri, "Missing scheme in absolute URI reference");
912             }
913             checkScheme(parsed.scheme);
914             if (parsed.authority != null) checkAuthority(parsed.authority);
915             checkPath(parsed.path);
916             if (parsed.fragment != null) checkFragment(parsed.fragment);
917             if (parsed.query != null) checkQuery(parsed.query);
918             cache.put(uri);
919         }
920         catch (MalformedURIException ex) {
921             ex.setData(uri);
922             throw ex;
923         }
924 
925     }
926 
927 
isAlpha(char c)928     static boolean isAlpha(char c) {
929 
930         switch(c) {
931             case 'A': return true;
932             case 'B': return true;
933             case 'C': return true;
934             case 'D': return true;
935             case 'E': return true;
936             case 'F': return true;
937             case 'G': return true;
938             case 'H': return true;
939             case 'I': return true;
940             case 'J': return true;
941             case 'K': return true;
942             case 'L': return true;
943             case 'M': return true;
944             case 'N': return true;
945             case 'O': return true;
946             case 'P': return true;
947             case 'Q': return true;
948             case 'R': return true;
949             case 'S': return true;
950             case 'T': return true;
951             case 'U': return true;
952             case 'V': return true;
953             case 'W': return true;
954             case 'X': return true;
955             case 'Y': return true;
956             case 'Z': return true;
957             case '[': return false;
958             case '\\': return false;
959             case ']': return false;
960             case '^': return false;
961             case '_': return false;
962             case '`': return false;
963             case 'a': return true;
964             case 'b': return true;
965             case 'c': return true;
966             case 'd': return true;
967             case 'e': return true;
968             case 'f': return true;
969             case 'g': return true;
970             case 'h': return true;
971             case 'i': return true;
972             case 'j': return true;
973             case 'k': return true;
974             case 'l': return true;
975             case 'm': return true;
976             case 'n': return true;
977             case 'o': return true;
978             case 'p': return true;
979             case 'q': return true;
980             case 'r': return true;
981             case 's': return true;
982             case 't': return true;
983             case 'u': return true;
984             case 'v': return true;
985             case 'w': return true;
986             case 'x': return true;
987             case 'y': return true;
988             case 'z': return true;
989         }
990 
991         return false;
992 
993     }
994 
995 
isSchemeCharacter(char c)996     static boolean isSchemeCharacter(char c) {
997 
998         /* The : and the ? cannot be reached here because they'll
999          * have been parsed out separately before this method is
1000          * called. They're included here strictly for alignment
1001          * so the compiler will generate a table lookup.
1002          */
1003         switch(c) {
1004             case '+': return true;
1005             case ',': return false;
1006             case '-': return true;
1007             case '.': return true;
1008             case '/': return false;
1009             case '0': return true;
1010             case '1': return true;
1011             case '2': return true;
1012             case '3': return true;
1013             case '4': return true;
1014             case '5': return true;
1015             case '6': return true;
1016             case '7': return true;
1017             case '8': return true;
1018             case '9': return true;
1019             case ':': return false;  // unreachable
1020             case ';': return false;
1021             case '<': return false;
1022             case '=': return false;
1023             case '>': return false;
1024             case '?': return false;  // unreachable
1025             case '@': return false;
1026             case 'A': return true;
1027             case 'B': return true;
1028             case 'C': return true;
1029             case 'D': return true;
1030             case 'E': return true;
1031             case 'F': return true;
1032             case 'G': return true;
1033             case 'H': return true;
1034             case 'I': return true;
1035             case 'J': return true;
1036             case 'K': return true;
1037             case 'L': return true;
1038             case 'M': return true;
1039             case 'N': return true;
1040             case 'O': return true;
1041             case 'P': return true;
1042             case 'Q': return true;
1043             case 'R': return true;
1044             case 'S': return true;
1045             case 'T': return true;
1046             case 'U': return true;
1047             case 'V': return true;
1048             case 'W': return true;
1049             case 'X': return true;
1050             case 'Y': return true;
1051             case 'Z': return true;
1052             case '[': return false;
1053             case '\\': return false;
1054             case ']': return false;
1055             case '^': return false;
1056             case '_': return false;
1057             case '`': return false;
1058             case 'a': return true;
1059             case 'b': return true;
1060             case 'c': return true;
1061             case 'd': return true;
1062             case 'e': return true;
1063             case 'f': return true;
1064             case 'g': return true;
1065             case 'h': return true;
1066             case 'i': return true;
1067             case 'j': return true;
1068             case 'k': return true;
1069             case 'l': return true;
1070             case 'm': return true;
1071             case 'n': return true;
1072             case 'o': return true;
1073             case 'p': return true;
1074             case 'q': return true;
1075             case 'r': return true;
1076             case 's': return true;
1077             case 't': return true;
1078             case 'u': return true;
1079             case 'v': return true;
1080             case 'w': return true;
1081             case 'x': return true;
1082             case 'y': return true;
1083             case 'z': return true;
1084         }
1085 
1086         return false;
1087 
1088     }
1089 
1090 
isPathCharacter(char c)1091     private static boolean isPathCharacter(char c) {
1092 
1093         switch(c) {
1094             case '!': return true;
1095             case '"': return false;
1096             case '#': return false;
1097             case '$': return true;
1098             case '%': return false; // checked separately
1099             case '&': return true;
1100             case '\'': return true;
1101             case '(': return true;
1102             case ')': return true;
1103             case '*': return true;
1104             case '+': return true;
1105             case ',': return true;
1106             case '-': return true;
1107             case '.': return true;
1108             case '/': return false; // handled separately
1109             case '0': return true;
1110             case '1': return true;
1111             case '2': return true;
1112             case '3': return true;
1113             case '4': return true;
1114             case '5': return true;
1115             case '6': return true;
1116             case '7': return true;
1117             case '8': return true;
1118             case '9': return true;
1119             case ':': return true;
1120             case ';': return true;
1121             case '<': return false;
1122             case '=': return true;
1123             case '>': return false;
1124             case '?': return false;
1125             case '@': return true;
1126             case 'A': return true;
1127             case 'B': return true;
1128             case 'C': return true;
1129             case 'D': return true;
1130             case 'E': return true;
1131             case 'F': return true;
1132             case 'G': return true;
1133             case 'H': return true;
1134             case 'I': return true;
1135             case 'J': return true;
1136             case 'K': return true;
1137             case 'L': return true;
1138             case 'M': return true;
1139             case 'N': return true;
1140             case 'O': return true;
1141             case 'P': return true;
1142             case 'Q': return true;
1143             case 'R': return true;
1144             case 'S': return true;
1145             case 'T': return true;
1146             case 'U': return true;
1147             case 'V': return true;
1148             case 'W': return true;
1149             case 'X': return true;
1150             case 'Y': return true;
1151             case 'Z': return true;
1152             case '[': return false;
1153             case '\\': return false;
1154             case ']': return false;
1155             case '^': return false;
1156             case '_': return true;
1157             case '`': return false;
1158             case 'a': return true;
1159             case 'b': return true;
1160             case 'c': return true;
1161             case 'd': return true;
1162             case 'e': return true;
1163             case 'f': return true;
1164             case 'g': return true;
1165             case 'h': return true;
1166             case 'i': return true;
1167             case 'j': return true;
1168             case 'k': return true;
1169             case 'l': return true;
1170             case 'm': return true;
1171             case 'n': return true;
1172             case 'o': return true;
1173             case 'p': return true;
1174             case 'q': return true;
1175             case 'r': return true;
1176             case 's': return true;
1177             case 't': return true;
1178             case 'u': return true;
1179             case 'v': return true;
1180             case 'w': return true;
1181             case 'x': return true;
1182             case 'y': return true;
1183             case 'z': return true;
1184             case '{': return false;
1185             case '|': return false;
1186             case '}': return false;
1187             case '~': return true;
1188         }
1189 
1190         return false;
1191 
1192     }
1193 
1194 
isUserInfoCharacter(char c)1195     private static boolean isUserInfoCharacter(char c) {
1196 
1197         switch(c) {
1198             case '!': return true;
1199             case '"': return false;
1200             case '#': return false;
1201             case '$': return true;
1202             case '%': return false; // checked separately
1203             case '&': return true;
1204             case '\'': return true;
1205             case '(': return true;
1206             case ')': return true;
1207             case '*': return true;
1208             case '+': return true;
1209             case ',': return true;
1210             case '-': return true;
1211             case '.': return true;
1212             case '/': return true;
1213             case '0': return true;
1214             case '1': return true;
1215             case '2': return true;
1216             case '3': return true;
1217             case '4': return true;
1218             case '5': return true;
1219             case '6': return true;
1220             case '7': return true;
1221             case '8': return true;
1222             case '9': return true;
1223             case ':': return true;
1224             case ';': return true;
1225             case '<': return false;
1226             case '=': return true;
1227             case '>': return false;
1228             case '?': return false;
1229             case '@': return false;
1230             case 'A': return true;
1231             case 'B': return true;
1232             case 'C': return true;
1233             case 'D': return true;
1234             case 'E': return true;
1235             case 'F': return true;
1236             case 'G': return true;
1237             case 'H': return true;
1238             case 'I': return true;
1239             case 'J': return true;
1240             case 'K': return true;
1241             case 'L': return true;
1242             case 'M': return true;
1243             case 'N': return true;
1244             case 'O': return true;
1245             case 'P': return true;
1246             case 'Q': return true;
1247             case 'R': return true;
1248             case 'S': return true;
1249             case 'T': return true;
1250             case 'U': return true;
1251             case 'V': return true;
1252             case 'W': return true;
1253             case 'X': return true;
1254             case 'Y': return true;
1255             case 'Z': return true;
1256             case '[': return false;
1257             case '\\': return false;
1258             case ']': return false;
1259             case '^': return false;
1260             case '_': return true;
1261             case '`': return false;
1262             case 'a': return true;
1263             case 'b': return true;
1264             case 'c': return true;
1265             case 'd': return true;
1266             case 'e': return true;
1267             case 'f': return true;
1268             case 'g': return true;
1269             case 'h': return true;
1270             case 'i': return true;
1271             case 'j': return true;
1272             case 'k': return true;
1273             case 'l': return true;
1274             case 'm': return true;
1275             case 'n': return true;
1276             case 'o': return true;
1277             case 'p': return true;
1278             case 'q': return true;
1279             case 'r': return true;
1280             case 's': return true;
1281             case 't': return true;
1282             case 'u': return true;
1283             case 'v': return true;
1284             case 'w': return true;
1285             case 'x': return true;
1286             case 'y': return true;
1287             case 'z': return true;
1288             case '{': return false;
1289             case '|': return false;
1290             case '}': return false;
1291             case '~': return true;
1292         }
1293 
1294         return false;
1295 
1296     }
1297 
1298 
1299     /**
1300      * Check to see that this string is an absolute URI,
1301      * neither a relative URI nor a URI reference.
1302      *
1303      */
checkAbsoluteURI(String uri)1304     static void checkAbsoluteURI(String uri) {
1305 
1306         URIUtil.ParsedURI parsed = new URIUtil.ParsedURI(uri);
1307         try {
1308             if (parsed.scheme == null) {
1309                 throwMalformedURIException(uri, "Missing scheme in absolute URI");
1310             }
1311             checkScheme(parsed.scheme);
1312             if (parsed.authority != null) checkAuthority(parsed.authority);
1313             checkPath(parsed.path);
1314             if (parsed.fragment != null) {
1315                 throwMalformedURIException(uri, "URIs cannot have fragment identifiers");
1316             }
1317             if (parsed.query != null) checkQuery(parsed.query);
1318         }
1319         catch (MalformedURIException ex) {
1320             ex.setData(uri);
1321             throw ex;
1322         }
1323 
1324     }
1325 
1326 
1327     // For use in checking internal DTD subsets
1328     private static XMLReader parser;
1329 
checkInternalDTDSubset(String subset)1330     static synchronized void checkInternalDTDSubset(String subset) {
1331 
1332         if (parser == null) {
1333             final InputSource empty = new InputSource(new EmptyReader());
1334             parser = Builder.findParser(false);
1335             // parser = new org.apache.crimson.parser.XMLReaderImpl();
1336             // Now let's stop this parser from loading any external
1337             // entities the subset references
1338             parser.setEntityResolver(new EntityResolver() {
1339 
1340                 public InputSource resolveEntity(String publicID, String systemID) {
1341                     return empty;
1342                 }
1343 
1344             });
1345         }
1346 
1347         String doc = "<!DOCTYPE a [" + subset + "]><a/>";
1348         try {
1349             InputSource source = new InputSource(new StringReader(doc));
1350             // just to make sure relative URLs can be resolved; don't
1351             // actually need to connect to this; the EntityResolver
1352             // prevents that
1353             source.setSystemId("http://www.example.org/");
1354             parser.parse(source);
1355         }
1356         catch (SAXException ex) {
1357             IllegalDataException idex = new IllegalDataException(
1358               "Malformed internal DTD subset: " + ex.getMessage(), ex);
1359             idex.setData(subset);
1360             throw idex;
1361         }
1362         catch (IOException ex) {
1363             throw new RuntimeException("BUG: I don't think this can happen");
1364         }
1365 
1366     }
1367 
1368 
1369     // A reader that immediately returns end of stream. This is a great
1370     // big hack to avoid reading anything when setting the internal
1371     // DTD subset. I could use the
1372     // http://xml.org/sax/features/external-parameter-entities SAX
1373     // feature, but many  parsers don't reliably implement that so
1374     // instead we simply pretend that all URLs point to empty files.
1375     private static class EmptyReader extends Reader {
1376 
read(char[] text, int start, int length)1377         public int read(char[] text, int start, int length) throws IOException {
1378             return -1;
1379         }
1380 
close()1381         public void close() {}
1382 
1383     }
1384 
1385 
1386 }