1 /* XPathTokenizer.java --
2    Copyright (C) 2004 Free Software Foundation, Inc.
3 
4 This file is part of GNU Classpath.
5 
6 GNU Classpath is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
9 any later version.
10 
11 GNU Classpath is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 General Public License for more details.
15 
16 You should have received a copy of the GNU General Public License
17 along with GNU Classpath; see the file COPYING.  If not, write to the
18 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 02110-1301 USA.
20 
21 Linking this library statically or dynamically with other modules is
22 making a combined work based on this library.  Thus, the terms and
23 conditions of the GNU General Public License cover the whole
24 combination.
25 
26 As a special exception, the copyright holders of this library give you
27 permission to link this library with independent modules to produce an
28 executable, regardless of the license terms of these independent
29 modules, and to copy and distribute the resulting executable under
30 terms of your choice, provided that you also meet, for each linked
31 independent module, the terms and conditions of the license of that
32 module.  An independent module is a module which is not derived from
33 or based on this library.  If you modify this library, you may extend
34 this exception to your version of the library, but you are not
35 obligated to do so.  If you do not wish to do so, delete this
36 exception statement from your version. */
37 
38 package gnu.xml.xpath;
39 
40 import gnu.java.lang.CPStringBuilder;
41 
42 import java.io.BufferedReader;
43 import java.io.IOException;
44 import java.io.Reader;
45 import java.io.StringReader;
46 import java.util.Map;
47 import java.util.TreeMap;
48 
49 /*import antlr.Token;
50 import antlr.TokenStream;
51 import antlr.TokenStreamException;
52 import antlr.TokenStreamIOException;*/
53 
54 /**
55  * XPath 1.0 expression tokenizer.
56  *
57  * @author <a href='mailto:dog@gnu.org'>Chris Burdess</a>
58  */
59 public class XPathTokenizer
60 implements XPathParser.yyInput
61 //implements TokenStream
62 {
63 
64   static class XPathToken
65   //extends Token
66   {
67 
68     int type;
69     String val;
70 
XPathToken(int type)71     XPathToken (int type)
72     {
73       this (type, null);
74     }
75 
XPathToken(int type, String val)76     XPathToken (int type, String val)
77     {
78       //super (type);
79       this.type = type;
80       this.val = val;
81     }
82 
getText()83     public String getText ()
84     {
85       return val;
86     }
87 
toString()88     public String toString ()
89     {
90       return val;
91     }
92 
93   }
94 
95   static final Map<String,Integer> keywords = new TreeMap<String,Integer> ();
96   static
97   {
98     keywords.put ("ancestor", new Integer (XPathParser.ANCESTOR));
99     keywords.put ("ancestor-or-self", new Integer (XPathParser.ANCESTOR_OR_SELF));
100     keywords.put ("attribute", new Integer (XPathParser.ATTRIBUTE));
101     keywords.put ("child", new Integer (XPathParser.CHILD));
102     keywords.put ("descendant", new Integer (XPathParser.DESCENDANT));
103     keywords.put ("descendant-or-self", new Integer (XPathParser.DESCENDANT_OR_SELF));
104     keywords.put ("following", new Integer (XPathParser.FOLLOWING));
105     keywords.put ("following-sibling", new Integer (XPathParser.FOLLOWING_SIBLING));
106     keywords.put ("namespace", new Integer (XPathParser.NAMESPACE));
107     keywords.put ("parent", new Integer (XPathParser.PARENT));
108     keywords.put ("preceding", new Integer (XPathParser.PRECEDING));
109     keywords.put ("preceding-sibling", new Integer (XPathParser.PRECEDING_SIBLING));
110     keywords.put ("self", new Integer (XPathParser.SELF));
111     keywords.put ("div", new Integer (XPathParser.DIV));
112     keywords.put ("mod", new Integer (XPathParser.MOD));
113     keywords.put ("or", new Integer (XPathParser.OR));
114     keywords.put ("and", new Integer (XPathParser.AND));
115     keywords.put ("comment", new Integer (XPathParser.COMMENT));
116     keywords.put ("processing-instruction", new Integer (XPathParser.PROCESSING_INSTRUCTION));
117     keywords.put ("text", new Integer (XPathParser.TEXT));
118     keywords.put ("node", new Integer (XPathParser.NODE));
119   }
120 
121   Reader in;
122   XPathToken token;
123   XPathToken lastToken;
124 
XPathTokenizer(String expr)125   public XPathTokenizer (String expr)
126   {
127     this (new StringReader (expr));
128   }
129 
XPathTokenizer(Reader in)130   XPathTokenizer (Reader in)
131   {
132     this.in = in.markSupported () ? in : new BufferedReader (in);
133   }
134 
135   /* Begin ANTLR specific *
136 
137   public Token nextToken ()
138     throws TokenStreamException
139   {
140     try
141       {
142         if (!advance ())
143           {
144             throw new TokenStreamException ("eof");
145           }
146         token ();
147         return token;
148       }
149     catch (IOException e)
150       {
151         throw new TokenStreamIOException (e);
152       }
153   }
154 
155   * End ANTLR specific */
156 
advance()157   public boolean advance ()
158     throws IOException
159   {
160     lastToken = token;
161     int c = in.read ();
162     switch (c)
163       {
164       case -1: // eof
165         return false;
166       case 0x20:
167       case 0x09:
168       case 0x0d:
169       case 0x0a: // skip whitespace
170         return advance ();
171       case 0x22: // "
172       case 0x27: // '
173         token = consume_literal (c);
174         break;
175       case 0x28: // (
176         token = new XPathToken (XPathParser.LP);
177         break;
178       case 0x29: // )
179         token = new XPathToken (XPathParser.RP);
180         break;
181       case 0x5b: // [
182         token = new XPathToken (XPathParser.LB);
183         break;
184       case 0x5d: // ]
185         token = new XPathToken (XPathParser.RB);
186         break;
187       case 0x2c: // ,
188         token = new XPathToken (XPathParser.COMMA);
189         break;
190       case 0x7c: // |
191         token = new XPathToken (XPathParser.PIPE);
192         break;
193       case 0x2f: // /
194         in.mark (1);
195         int d1 = in.read ();
196         if (d1 == 0x2f)
197           {
198             token = new XPathToken (XPathParser.DOUBLE_SLASH);
199           }
200         else
201           {
202             in.reset ();
203             token = new XPathToken (XPathParser.SLASH);
204           }
205         break;
206       case 0x3d: // =
207         token = new XPathToken (XPathParser.EQ);
208         break;
209       case 0x21: // !
210         in.mark (1);
211         int d2 = in.read ();
212         if (d2 == 0x3d) // =
213           {
214             token = new XPathToken (XPathParser.NE);
215           }
216         else
217           {
218             in.reset ();
219             token = new XPathToken (XPathParser.yyErrorCode);
220           }
221         break;
222       case 0x3e: // >
223         in.mark (1);
224         int d3 = in.read ();
225         if (d3 == 0x3d) // =
226           {
227             token = new XPathToken (XPathParser.GTE);
228           }
229         else
230           {
231             in.reset ();
232             token = new XPathToken (XPathParser.GT);
233           }
234         break;
235       case 0x3c: // <
236         in.mark (1);
237         int d4 = in.read ();
238         if (d4 == 0x3d) // =
239           {
240             token = new XPathToken (XPathParser.LTE);
241           }
242         else
243           {
244             in.reset ();
245             token = new XPathToken (XPathParser.LT);
246           }
247         break;
248       case 0x2b: // +
249         token = new XPathToken (XPathParser.PLUS);
250         break;
251       case 0x2d: // -
252         token = new XPathToken (XPathParser.MINUS);
253         break;
254       case 0x40: // @
255         token = new XPathToken (XPathParser.AT);
256         break;
257       case 0x2a: // *
258         token = new XPathToken (XPathParser.STAR);
259         break;
260       case 0x24: // $
261         token = new XPathToken (XPathParser.DOLLAR);
262         break;
263       case 0x3a: // :
264         in.mark (1);
265         int d5 = in.read ();
266         if (d5 == 0x3a)
267           {
268             token = new XPathToken (XPathParser.DOUBLE_COLON);
269           }
270         else
271           {
272             in.reset ();
273             token = new XPathToken (XPathParser.COLON);
274           }
275         break;
276       case 0x2e: // .
277         in.mark (1);
278         int d6 = in.read ();
279         if (d6 == 0x2e)
280           {
281             token = new XPathToken (XPathParser.DOUBLE_DOT);
282           }
283         else
284           {
285             in.reset ();
286             token = new XPathToken (XPathParser.DOT);
287           }
288         break;
289       default:
290         if (c >= 0x30 && c <= 0x39)
291           {
292             token = consume_digits (c);
293           }
294         else if (c == 0x5f || Character.isLetter ((char) c))
295           {
296             token = consume_name (c);
297           }
298         else
299           {
300             token = new XPathToken (XPathParser.yyErrorCode);
301           }
302       }
303     return true;
304   }
305 
token()306   public int token ()
307   {
308     return token.type;
309   }
310 
value()311   public Object value ()
312   {
313     return token.val;
314   }
315 
consume_literal(int delimiter)316   XPathToken consume_literal (int delimiter)
317     throws IOException
318   {
319     CPStringBuilder buf = new CPStringBuilder ();
320     while (true)
321       {
322         int c = in.read ();
323         if (c == -1)
324           {
325             return new XPathToken (XPathParser.yyErrorCode);
326           }
327         else if (c == delimiter)
328           {
329             return new XPathToken (XPathParser.LITERAL, buf.toString ());
330           }
331         else
332           {
333             buf.append ((char) c);
334           }
335       }
336   }
337 
consume_digits(int c)338   XPathToken consume_digits (int c)
339     throws IOException
340   {
341     CPStringBuilder buf = new CPStringBuilder ();
342     buf.append ((char) c);
343     while (true)
344       {
345         in.mark (1);
346         c = in.read ();
347         if (c >= 0x30 && c <= 0x39)
348           {
349             buf.append ((char) c);
350           }
351         else
352           {
353             in.reset ();
354             return new XPathToken (XPathParser.DIGITS, buf.toString ());
355           }
356       }
357   }
358 
consume_name(int c)359   XPathToken consume_name (int c)
360     throws IOException
361   {
362     CPStringBuilder buf = new CPStringBuilder ();
363     buf.append ((char) c);
364     while (true)
365       {
366         in.mark (1);
367         c = in.read ();
368         if (isNameChar (c))
369           {
370             buf.append ((char) c);
371           }
372         else
373           {
374             in.reset ();
375             String name = buf.toString ();
376             Integer keyword = (Integer) keywords.get (name);
377             if (keyword == null)
378               {
379                 return new XPathToken (XPathParser.NAME, name);
380               }
381             else
382               {
383                 int val = keyword.intValue ();
384                 switch (val)
385                   {
386                   case XPathParser.NODE:
387                   case XPathParser.COMMENT:
388                   case XPathParser.TEXT:
389                   case XPathParser.PROCESSING_INSTRUCTION:
390                     // Consume subsequent (
391                     in.mark (1);
392                     do
393                       {
394                         c = in.read ();
395                       }
396                     while (c == 0x20 || c == 0x09);
397                     if (c != 0x28)
398                       {
399                         in.reset ();
400                         return new XPathToken (XPathParser.NAME, name);
401                       }
402                     break;
403                   case XPathParser.CHILD:
404                   case XPathParser.PARENT:
405                   case XPathParser.SELF:
406                   case XPathParser.DESCENDANT:
407                   case XPathParser.ANCESTOR:
408                   case XPathParser.DESCENDANT_OR_SELF:
409                   case XPathParser.ANCESTOR_OR_SELF:
410                   case XPathParser.ATTRIBUTE:
411                   case XPathParser.NAMESPACE:
412                   case XPathParser.FOLLOWING:
413                   case XPathParser.FOLLOWING_SIBLING:
414                   case XPathParser.PRECEDING:
415                   case XPathParser.PRECEDING_SIBLING:
416                     // Check that this is an axis specifier
417                     in.mark(1);
418                     do
419                       {
420                         c = in.read();
421                       }
422                     while (c == 0x20 || c == 0x09);
423                     if (c == 0x3a)
424                       {
425                         c = in.read();
426                         if (c == 0x3a)
427                           {
428                             in.reset();
429                             return new XPathToken(val);
430                           }
431                       }
432                     in.reset();
433                     return new XPathToken(XPathParser.NAME, name);
434                   case XPathParser.DIV:
435                   case XPathParser.MOD:
436                     // May be a name
437                     if (lastToken == null)
438                       {
439                         return new XPathToken(XPathParser.NAME, name);
440                       }
441                     switch (lastToken.type)
442                       {
443                       case XPathParser.LP:
444                       case XPathParser.LB:
445                       case XPathParser.COMMA:
446                       case XPathParser.PIPE:
447                       case XPathParser.EQ:
448                       case XPathParser.NE:
449                       case XPathParser.GT:
450                       case XPathParser.LT:
451                       case XPathParser.GTE:
452                       case XPathParser.LTE:
453                       case XPathParser.PLUS:
454                       case XPathParser.MINUS:
455                       case XPathParser.STAR:
456                       case XPathParser.AT:
457                       case XPathParser.DOLLAR:
458                       case XPathParser.COLON:
459                       case XPathParser.DOUBLE_COLON:
460                       case XPathParser.DIV:
461                       case XPathParser.MOD:
462                       case XPathParser.OR:
463                       case XPathParser.AND:
464                       case XPathParser.SLASH:
465                         return new XPathToken(XPathParser.NAME, name);
466                       }
467                     break;
468                   }
469                 return new XPathToken (val);
470               }
471           }
472       }
473   }
474 
isNameChar(int c)475   boolean isNameChar (int c)
476   {
477     /* Name */
478     return (c == 0x5f
479             || c == 0x2d
480             || c == 0x2e
481             || (c >= 0x30 && c <= 0x39)
482             /* CombiningChar */
483             || (c >= 0x0300 && c <= 0x0345)
484             || (c >= 0x0360 && c <= 0x0361)
485             || (c >= 0x0483 && c <= 0x0486)
486             || (c >= 0x0591 && c <= 0x05A1)
487             || (c >= 0x05A3 && c <= 0x05B9)
488             || (c >= 0x05BB && c <= 0x05BD)
489             || c == 0x05BF
490             || (c >= 0x05C1 && c <= 0x05C2)
491             || c == 0x05C4
492             || (c >= 0x064B && c <= 0x0652)
493             || c == 0x0670
494             || (c >= 0x06D6 && c <= 0x06DC)
495             || (c >= 0x06DD && c <= 0x06DF)
496             || (c >= 0x06E0 && c <= 0x06E4)
497             || (c >= 0x06E7 && c <= 0x06E8)
498             || (c >= 0x06EA && c <= 0x06ED)
499             || (c >= 0x0901 && c <= 0x0903)
500             || c == 0x093C
501             || (c >= 0x093E && c <= 0x094C)
502             || c == 0x094D
503             || (c >= 0x0951 && c <= 0x0954)
504             || (c >= 0x0962 && c <= 0x0963)
505             || (c >= 0x0981 && c <= 0x0983)
506             || c == 0x09BC
507             || c == 0x09BE
508             || c == 0x09BF
509             || (c >= 0x09C0 && c <= 0x09C4)
510             || (c >= 0x09C7 && c <= 0x09C8)
511             || (c >= 0x09CB && c <= 0x09CD)
512             || c == 0x09D7
513             || (c >= 0x09E2 && c <= 0x09E3)
514             || c == 0x0A02
515             || c == 0x0A3C
516             || c == 0x0A3E
517             || c == 0x0A3F
518             || (c >= 0x0A40 && c <= 0x0A42)
519             || (c >= 0x0A47 && c <= 0x0A48)
520             || (c >= 0x0A4B && c <= 0x0A4D)
521             || (c >= 0x0A70 && c <= 0x0A71)
522             || (c >= 0x0A81 && c <= 0x0A83)
523             || c == 0x0ABC
524             || (c >= 0x0ABE && c <= 0x0AC5)
525             || (c >= 0x0AC7 && c <= 0x0AC9)
526             || (c >= 0x0ACB && c <= 0x0ACD)
527             || (c >= 0x0B01 && c <= 0x0B03)
528             || c == 0x0B3C
529             || (c >= 0x0B3E && c <= 0x0B43)
530             || (c >= 0x0B47 && c <= 0x0B48)
531             || (c >= 0x0B4B && c <= 0x0B4D)
532             || (c >= 0x0B56 && c <= 0x0B57)
533             || (c >= 0x0B82 && c <= 0x0B83)
534             || (c >= 0x0BBE && c <= 0x0BC2)
535             || (c >= 0x0BC6 && c <= 0x0BC8)
536             || (c >= 0x0BCA && c <= 0x0BCD)
537             || c == 0x0BD7
538             || (c >= 0x0C01 && c <= 0x0C03)
539             || (c >= 0x0C3E && c <= 0x0C44)
540             || (c >= 0x0C46 && c <= 0x0C48)
541             || (c >= 0x0C4A && c <= 0x0C4D)
542             || (c >= 0x0C55 && c <= 0x0C56)
543             || (c >= 0x0C82 && c <= 0x0C83)
544             || (c >= 0x0CBE && c <= 0x0CC4)
545             || (c >= 0x0CC6 && c <= 0x0CC8)
546             || (c >= 0x0CCA && c <= 0x0CCD)
547             || (c >= 0x0CD5 && c <= 0x0CD6)
548             || (c >= 0x0D02 && c <= 0x0D03)
549             || (c >= 0x0D3E && c <= 0x0D43)
550             || (c >= 0x0D46 && c <= 0x0D48)
551             || (c >= 0x0D4A && c <= 0x0D4D)
552             || c == 0x0D57
553             || c == 0x0E31
554             || (c >= 0x0E34 && c <= 0x0E3A)
555             || (c >= 0x0E47 && c <= 0x0E4E)
556             || c == 0x0EB1
557             || (c >= 0x0EB4 && c <= 0x0EB9)
558             || (c >= 0x0EBB && c <= 0x0EBC)
559             || (c >= 0x0EC8 && c <= 0x0ECD)
560             || (c >= 0x0F18 && c <= 0x0F19)
561             || c == 0x0F35
562             || c == 0x0F37
563             || c == 0x0F39
564             || c == 0x0F3E
565             || c == 0x0F3F
566             || (c >= 0x0F71 && c <= 0x0F84)
567             || (c >= 0x0F86 && c <= 0x0F8B)
568             || (c >= 0x0F90 && c <= 0x0F95)
569             || c == 0x0F97
570             || (c >= 0x0F99 && c <= 0x0FAD)
571             || (c >= 0x0FB1 && c <= 0x0FB7)
572             || c == 0x0FB9
573             || (c >= 0x20D0 && c <= 0x20DC)
574             || c == 0x20E1
575             || (c >= 0x302A && c <= 0x302F)
576             || c == 0x3099
577             || c == 0x309A
578             /* Extender */
579             || c == 0x00B7
580             || c == 0x02D0
581             || c == 0x02D1
582             || c == 0x0387
583             || c == 0x0640
584             || c == 0x0E46
585             || c == 0x0EC6
586             || c == 0x3005
587             || (c >= 0x3031 && c <= 0x3035)
588             || (c >= 0x309D && c <= 0x309E)
589             || (c >= 0x30FC && c <= 0x30FE)
590             /* Name */
591             || Character.isLetter ((char) c));
592   }
593 
594 }
595