1 /*
2  * Copyright 2005 Maksim Orlovich <maksim@kde.org>
3  * Copyright (C) 2006 Apple Computer, Inc.
4  * Copyright (C) 2007 Alexey Proskuryakov <ap@webkit.org>
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  *
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
20  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
21  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 #include "config.h"
29 #include "XPathParser.h"
30 
31 #if ENABLE(XPATH)
32 
33 #include "ExceptionCode.h"
34 #include "XPathEvaluator.h"
35 #include "XPathException.h"
36 #include "XPathNSResolver.h"
37 #include "XPathStep.h"
38 #include <wtf/StdLibExtras.h>
39 #include <wtf/text/StringHash.h>
40 
41 int xpathyyparse(void*);
42 
43 using namespace WTF;
44 using namespace Unicode;
45 
46 namespace WebCore {
47 namespace XPath {
48 
49 class LocationPath;
50 
51 #include "XPathGrammar.h"
52 
53 Parser* Parser::currentParser = 0;
54 
55 enum XMLCat { NameStart, NameCont, NotPartOfName };
56 
57 typedef HashMap<String, Step::Axis> AxisNamesMap;
58 
charCat(UChar aChar)59 static XMLCat charCat(UChar aChar)
60 {
61     //### might need to add some special cases from the XML spec.
62 
63     if (aChar == '_')
64         return NameStart;
65 
66     if (aChar == '.' || aChar == '-')
67         return NameCont;
68     CharCategory category = Unicode::category(aChar);
69     if (category & (Letter_Uppercase | Letter_Lowercase | Letter_Other | Letter_Titlecase | Number_Letter))
70         return NameStart;
71     if (category & (Mark_NonSpacing | Mark_SpacingCombining | Mark_Enclosing | Letter_Modifier | Number_DecimalDigit))
72         return NameCont;
73     return NotPartOfName;
74 }
75 
setUpAxisNamesMap(AxisNamesMap & axisNames)76 static void setUpAxisNamesMap(AxisNamesMap& axisNames)
77 {
78     struct AxisName {
79         const char* name;
80         Step::Axis axis;
81     };
82     const AxisName axisNameList[] = {
83         { "ancestor", Step::AncestorAxis },
84         { "ancestor-or-self", Step::AncestorOrSelfAxis },
85         { "attribute", Step::AttributeAxis },
86         { "child", Step::ChildAxis },
87         { "descendant", Step::DescendantAxis },
88         { "descendant-or-self", Step::DescendantOrSelfAxis },
89         { "following", Step::FollowingAxis },
90         { "following-sibling", Step::FollowingSiblingAxis },
91         { "namespace", Step::NamespaceAxis },
92         { "parent", Step::ParentAxis },
93         { "preceding", Step::PrecedingAxis },
94         { "preceding-sibling", Step::PrecedingSiblingAxis },
95         { "self", Step::SelfAxis }
96     };
97     for (unsigned i = 0; i < sizeof(axisNameList) / sizeof(axisNameList[0]); ++i)
98         axisNames.set(axisNameList[i].name, axisNameList[i].axis);
99 }
100 
isAxisName(const String & name,Step::Axis & type)101 static bool isAxisName(const String& name, Step::Axis& type)
102 {
103     DEFINE_STATIC_LOCAL(AxisNamesMap, axisNames, ());
104 
105     if (axisNames.isEmpty())
106         setUpAxisNamesMap(axisNames);
107 
108     AxisNamesMap::iterator it = axisNames.find(name);
109     if (it == axisNames.end())
110         return false;
111     type = it->second;
112     return true;
113 }
114 
isNodeTypeName(const String & name)115 static bool isNodeTypeName(const String& name)
116 {
117     DEFINE_STATIC_LOCAL(HashSet<String>, nodeTypeNames, ());
118     if (nodeTypeNames.isEmpty()) {
119         nodeTypeNames.add("comment");
120         nodeTypeNames.add("text");
121         nodeTypeNames.add("processing-instruction");
122         nodeTypeNames.add("node");
123     }
124     return nodeTypeNames.contains(name);
125 }
126 
127 // Returns whether the current token can possibly be a binary operator, given
128 // the previous token. Necessary to disambiguate some of the operators
129 // (* (multiply), div, and, or, mod) in the [32] Operator rule
130 // (check http://www.w3.org/TR/xpath#exprlex).
isBinaryOperatorContext() const131 bool Parser::isBinaryOperatorContext() const
132 {
133     switch (m_lastTokenType) {
134     case 0:
135     case '@': case AXISNAME: case '(': case '[': case ',':
136     case AND: case OR: case MULOP:
137     case '/': case SLASHSLASH: case '|': case PLUS: case MINUS:
138     case EQOP: case RELOP:
139         return false;
140     default:
141         return true;
142     }
143 }
144 
skipWS()145 void Parser::skipWS()
146 {
147     while (m_nextPos < m_data.length() && isSpaceOrNewline(m_data[m_nextPos]))
148         ++m_nextPos;
149 }
150 
makeTokenAndAdvance(int code,int advance)151 Token Parser::makeTokenAndAdvance(int code, int advance)
152 {
153     m_nextPos += advance;
154     return Token(code);
155 }
156 
makeTokenAndAdvance(int code,NumericOp::Opcode val,int advance)157 Token Parser::makeTokenAndAdvance(int code, NumericOp::Opcode val, int advance)
158 {
159     m_nextPos += advance;
160     return Token(code, val);
161 }
162 
makeTokenAndAdvance(int code,EqTestOp::Opcode val,int advance)163 Token Parser::makeTokenAndAdvance(int code, EqTestOp::Opcode val, int advance)
164 {
165     m_nextPos += advance;
166     return Token(code, val);
167 }
168 
169 // Returns next char if it's there and interesting, 0 otherwise
peekAheadHelper()170 char Parser::peekAheadHelper()
171 {
172     if (m_nextPos + 1 >= m_data.length())
173         return 0;
174     UChar next = m_data[m_nextPos + 1];
175     if (next >= 0xff)
176         return 0;
177     return next;
178 }
179 
peekCurHelper()180 char Parser::peekCurHelper()
181 {
182     if (m_nextPos >= m_data.length())
183         return 0;
184     UChar next = m_data[m_nextPos];
185     if (next >= 0xff)
186         return 0;
187     return next;
188 }
189 
lexString()190 Token Parser::lexString()
191 {
192     UChar delimiter = m_data[m_nextPos];
193     int startPos = m_nextPos + 1;
194 
195     for (m_nextPos = startPos; m_nextPos < m_data.length(); ++m_nextPos) {
196         if (m_data[m_nextPos] == delimiter) {
197             String value = m_data.substring(startPos, m_nextPos - startPos);
198             if (value.isNull())
199                 value = "";
200             ++m_nextPos; // Consume the char.
201             return Token(LITERAL, value);
202         }
203     }
204 
205     // Ouch, went off the end -- report error.
206     return Token(XPATH_ERROR);
207 }
208 
lexNumber()209 Token Parser::lexNumber()
210 {
211     int startPos = m_nextPos;
212     bool seenDot = false;
213 
214     // Go until end or a non-digits character.
215     for (; m_nextPos < m_data.length(); ++m_nextPos) {
216         UChar aChar = m_data[m_nextPos];
217         if (aChar >= 0xff) break;
218 
219         if (aChar < '0' || aChar > '9') {
220             if (aChar == '.' && !seenDot)
221                 seenDot = true;
222             else
223                 break;
224         }
225     }
226 
227     return Token(NUMBER, m_data.substring(startPos, m_nextPos - startPos));
228 }
229 
lexNCName(String & name)230 bool Parser::lexNCName(String& name)
231 {
232     int startPos = m_nextPos;
233     if (m_nextPos >= m_data.length())
234         return false;
235 
236     if (charCat(m_data[m_nextPos]) != NameStart)
237         return false;
238 
239     // Keep going until we get a character that's not good for names.
240     for (; m_nextPos < m_data.length(); ++m_nextPos)
241         if (charCat(m_data[m_nextPos]) == NotPartOfName)
242             break;
243 
244     name = m_data.substring(startPos, m_nextPos - startPos);
245     return true;
246 }
247 
lexQName(String & name)248 bool Parser::lexQName(String& name)
249 {
250     String n1;
251     if (!lexNCName(n1))
252         return false;
253 
254     skipWS();
255 
256     // If the next character is :, what we just got it the prefix, if not,
257     // it's the whole thing.
258     if (peekAheadHelper() != ':') {
259         name = n1;
260         return true;
261     }
262 
263     String n2;
264     if (!lexNCName(n2))
265         return false;
266 
267     name = n1 + ":" + n2;
268     return true;
269 }
270 
nextTokenInternal()271 Token Parser::nextTokenInternal()
272 {
273     skipWS();
274 
275     if (m_nextPos >= m_data.length())
276         return Token(0);
277 
278     char code = peekCurHelper();
279     switch (code) {
280     case '(': case ')': case '[': case ']':
281     case '@': case ',': case '|':
282         return makeTokenAndAdvance(code);
283     case '\'':
284     case '\"':
285         return lexString();
286     case '0': case '1': case '2': case '3': case '4':
287     case '5': case '6': case '7': case '8': case '9':
288         return lexNumber();
289     case '.': {
290         char next = peekAheadHelper();
291         if (next == '.')
292             return makeTokenAndAdvance(DOTDOT, 2);
293         if (next >= '0' && next <= '9')
294             return lexNumber();
295         return makeTokenAndAdvance('.');
296     }
297     case '/':
298         if (peekAheadHelper() == '/')
299             return makeTokenAndAdvance(SLASHSLASH, 2);
300         return makeTokenAndAdvance('/');
301     case '+':
302         return makeTokenAndAdvance(PLUS);
303     case '-':
304         return makeTokenAndAdvance(MINUS);
305     case '=':
306         return makeTokenAndAdvance(EQOP, EqTestOp::OP_EQ);
307     case '!':
308         if (peekAheadHelper() == '=')
309             return makeTokenAndAdvance(EQOP, EqTestOp::OP_NE, 2);
310         return Token(XPATH_ERROR);
311     case '<':
312         if (peekAheadHelper() == '=')
313             return makeTokenAndAdvance(RELOP, EqTestOp::OP_LE, 2);
314         return makeTokenAndAdvance(RELOP, EqTestOp::OP_LT);
315     case '>':
316         if (peekAheadHelper() == '=')
317             return makeTokenAndAdvance(RELOP, EqTestOp::OP_GE, 2);
318         return makeTokenAndAdvance(RELOP, EqTestOp::OP_GT);
319     case '*':
320         if (isBinaryOperatorContext())
321             return makeTokenAndAdvance(MULOP, NumericOp::OP_Mul);
322         ++m_nextPos;
323         return Token(NAMETEST, "*");
324     case '$': { // $ QName
325         m_nextPos++;
326         String name;
327         if (!lexQName(name))
328             return Token(XPATH_ERROR);
329         return Token(VARIABLEREFERENCE, name);
330     }
331     }
332 
333     String name;
334     if (!lexNCName(name))
335         return Token(XPATH_ERROR);
336 
337     skipWS();
338     // If we're in an operator context, check for any operator names
339     if (isBinaryOperatorContext()) {
340         if (name == "and") //### hash?
341             return Token(AND);
342         if (name == "or")
343             return Token(OR);
344         if (name == "mod")
345             return Token(MULOP, NumericOp::OP_Mod);
346         if (name == "div")
347             return Token(MULOP, NumericOp::OP_Div);
348     }
349 
350     // See whether we are at a :
351     if (peekCurHelper() == ':') {
352         m_nextPos++;
353         // Any chance it's an axis name?
354         if (peekCurHelper() == ':') {
355             m_nextPos++;
356 
357             //It might be an axis name.
358             Step::Axis axis;
359             if (isAxisName(name, axis))
360                 return Token(AXISNAME, axis);
361             // Ugh, :: is only valid in axis names -> error
362             return Token(XPATH_ERROR);
363         }
364 
365         // Seems like this is a fully qualified qname, or perhaps the * modified one from NameTest
366         skipWS();
367         if (peekCurHelper() == '*') {
368             m_nextPos++;
369             return Token(NAMETEST, name + ":*");
370         }
371 
372         // Make a full qname.
373         String n2;
374         if (!lexNCName(n2))
375             return Token(XPATH_ERROR);
376 
377         name = name + ":" + n2;
378     }
379 
380     skipWS();
381     if (peekCurHelper() == '(') {
382         //note: we don't swallow the (here!
383 
384         //either node type of function name
385         if (isNodeTypeName(name)) {
386             if (name == "processing-instruction")
387                 return Token(PI, name);
388 
389             return Token(NODETYPE, name);
390         }
391         //must be a function name.
392         return Token(FUNCTIONNAME, name);
393     }
394 
395     // At this point, it must be NAMETEST.
396     return Token(NAMETEST, name);
397 }
398 
nextToken()399 Token Parser::nextToken()
400 {
401     Token toRet = nextTokenInternal();
402     m_lastTokenType = toRet.type;
403     return toRet;
404 }
405 
Parser()406 Parser::Parser()
407 {
408     reset(String());
409 }
410 
~Parser()411 Parser::~Parser()
412 {
413 }
414 
reset(const String & data)415 void Parser::reset(const String& data)
416 {
417     m_nextPos = 0;
418     m_data = data;
419     m_lastTokenType = 0;
420 
421     m_topExpr = 0;
422     m_gotNamespaceError = false;
423 }
424 
lex(void * data)425 int Parser::lex(void* data)
426 {
427     YYSTYPE* yylval = static_cast<YYSTYPE*>(data);
428     Token tok = nextToken();
429 
430     switch (tok.type) {
431     case AXISNAME:
432         yylval->axis = tok.axis;
433         break;
434     case MULOP:
435         yylval->numop = tok.numop;
436         break;
437     case RELOP:
438     case EQOP:
439         yylval->eqop = tok.eqop;
440         break;
441     case NODETYPE:
442     case PI:
443     case FUNCTIONNAME:
444     case LITERAL:
445     case VARIABLEREFERENCE:
446     case NUMBER:
447     case NAMETEST:
448         yylval->str = new String(tok.str);
449         registerString(yylval->str);
450         break;
451     }
452 
453     return tok.type;
454 }
455 
expandQName(const String & qName,String & localName,String & namespaceURI)456 bool Parser::expandQName(const String& qName, String& localName, String& namespaceURI)
457 {
458     size_t colon = qName.find(':');
459     if (colon != notFound) {
460         if (!m_resolver)
461             return false;
462         namespaceURI = m_resolver->lookupNamespaceURI(qName.left(colon));
463         if (namespaceURI.isNull())
464             return false;
465         localName = qName.substring(colon + 1);
466     } else
467         localName = qName;
468 
469     return true;
470 }
471 
parseStatement(const String & statement,PassRefPtr<XPathNSResolver> resolver,ExceptionCode & ec)472 Expression* Parser::parseStatement(const String& statement, PassRefPtr<XPathNSResolver> resolver, ExceptionCode& ec)
473 {
474     reset(statement);
475 
476     m_resolver = resolver;
477 
478     Parser* oldParser = currentParser;
479     currentParser = this;
480     int parseError = xpathyyparse(this);
481     currentParser = oldParser;
482 
483     if (parseError) {
484         deleteAllValues(m_parseNodes);
485         m_parseNodes.clear();
486 
487         HashSet<Vector<Predicate*>*>::iterator pend = m_predicateVectors.end();
488         for (HashSet<Vector<Predicate*>*>::iterator it = m_predicateVectors.begin(); it != pend; ++it) {
489             deleteAllValues(**it);
490             delete *it;
491         }
492         m_predicateVectors.clear();
493 
494         HashSet<Vector<Expression*>*>::iterator eend = m_expressionVectors.end();
495         for (HashSet<Vector<Expression*>*>::iterator it = m_expressionVectors.begin(); it != eend; ++it) {
496             deleteAllValues(**it);
497             delete *it;
498         }
499         m_expressionVectors.clear();
500 
501         deleteAllValues(m_strings);
502         m_strings.clear();
503 
504         deleteAllValues(m_nodeTests);
505         m_nodeTests.clear();
506 
507         m_topExpr = 0;
508 
509         if (m_gotNamespaceError)
510             ec = NAMESPACE_ERR;
511         else
512             ec = XPathException::INVALID_EXPRESSION_ERR;
513         return 0;
514     }
515 
516     ASSERT(m_parseNodes.size() == 1);
517     ASSERT(*m_parseNodes.begin() == m_topExpr);
518     ASSERT(m_expressionVectors.size() == 0);
519     ASSERT(m_predicateVectors.size() == 0);
520     ASSERT(m_strings.size() == 0);
521     ASSERT(m_nodeTests.size() == 0);
522 
523     m_parseNodes.clear();
524     Expression* result = m_topExpr;
525     m_topExpr = 0;
526 
527     return result;
528 }
529 
registerParseNode(ParseNode * node)530 void Parser::registerParseNode(ParseNode* node)
531 {
532     if (node == 0)
533         return;
534 
535     ASSERT(!m_parseNodes.contains(node));
536 
537     m_parseNodes.add(node);
538 }
539 
unregisterParseNode(ParseNode * node)540 void Parser::unregisterParseNode(ParseNode* node)
541 {
542     if (node == 0)
543         return;
544 
545     ASSERT(m_parseNodes.contains(node));
546 
547     m_parseNodes.remove(node);
548 }
549 
registerPredicateVector(Vector<Predicate * > * vector)550 void Parser::registerPredicateVector(Vector<Predicate*>* vector)
551 {
552     if (vector == 0)
553         return;
554 
555     ASSERT(!m_predicateVectors.contains(vector));
556 
557     m_predicateVectors.add(vector);
558 }
559 
deletePredicateVector(Vector<Predicate * > * vector)560 void Parser::deletePredicateVector(Vector<Predicate*>* vector)
561 {
562     if (vector == 0)
563         return;
564 
565     ASSERT(m_predicateVectors.contains(vector));
566 
567     m_predicateVectors.remove(vector);
568     delete vector;
569 }
570 
571 
registerExpressionVector(Vector<Expression * > * vector)572 void Parser::registerExpressionVector(Vector<Expression*>* vector)
573 {
574     if (vector == 0)
575         return;
576 
577     ASSERT(!m_expressionVectors.contains(vector));
578 
579     m_expressionVectors.add(vector);
580 }
581 
deleteExpressionVector(Vector<Expression * > * vector)582 void Parser::deleteExpressionVector(Vector<Expression*>* vector)
583 {
584     if (vector == 0)
585         return;
586 
587     ASSERT(m_expressionVectors.contains(vector));
588 
589     m_expressionVectors.remove(vector);
590     delete vector;
591 }
592 
registerString(String * s)593 void Parser::registerString(String* s)
594 {
595     if (s == 0)
596         return;
597 
598     ASSERT(!m_strings.contains(s));
599 
600     m_strings.add(s);
601 }
602 
deleteString(String * s)603 void Parser::deleteString(String* s)
604 {
605     if (s == 0)
606         return;
607 
608     ASSERT(m_strings.contains(s));
609 
610     m_strings.remove(s);
611     delete s;
612 }
613 
registerNodeTest(Step::NodeTest * t)614 void Parser::registerNodeTest(Step::NodeTest* t)
615 {
616     if (t == 0)
617         return;
618 
619     ASSERT(!m_nodeTests.contains(t));
620 
621     m_nodeTests.add(t);
622 }
623 
deleteNodeTest(Step::NodeTest * t)624 void Parser::deleteNodeTest(Step::NodeTest* t)
625 {
626     if (t == 0)
627         return;
628 
629     ASSERT(m_nodeTests.contains(t));
630 
631     m_nodeTests.remove(t);
632     delete t;
633 }
634 
635 }
636 }
637 
638 #endif // ENABLE(XPATH)
639