//------------------------------------------------------------------------------
// <copyright file="XPathScanner.cs" company="Microsoft">
//     Copyright (c) Microsoft Corporation. All rights reserved.
// </copyright>
// <owner current="true" primary="true">Microsoft</owner>
// <spec>http://www.w3.org/TR/xpath#exprlex</spec>
//------------------------------------------------------------------------------

using System.Diagnostics;

namespace System.Xml.Xsl.XPath {
    using Res = System.Xml.Utils.Res;

    // Extends XPathOperator enumeration.
    //
    // Layout of the values (the scanner relies on it):
    //   * Unknown .. LastOperator   -- operators; CheckOperator() tests "prevKind <= LastOperator"
    //   * DotDot .. LastNonChar     -- multi-character or synthetic lexemes
    //   * values above LastNonChar  -- single-character lexemes whose value is the character itself,
    //                                  which lets NextLex() produce them with a plain cast
    internal enum LexKind {
        Unknown,        // Unknown lexeme
        Or,             // Operator 'or'
        And,            // Operator 'and'
        Eq,             // Operator '='
        Ne,             // Operator '!='
        Lt,             // Operator '<'
        Le,             // Operator '<='
        Gt,             // Operator '>'
        Ge,             // Operator '>='
        Plus,           // Operator '+'
        Minus,          // Operator '-'
        Multiply,       // Operator '*'
        Divide,         // Operator 'div'
        Modulo,         // Operator 'mod'
        UnaryMinus,     // Not used
        Union,          // Operator '|'
        LastOperator = Union,

        DotDot,         // '..'
        ColonColon,     // '::'
        SlashSlash,     // Operator '//'
        Number,         // Number (numeric literal)
        Axis,           // AxisName

        Name,           // NameTest, NodeType, FunctionName, AxisName, second part of VariableReference
        String,         // Literal (string literal)
        Eof,            // End of the expression

        FirstStringable = Name,     // Smallest kind accepted by CheckToken()/LexKindToString()
        LastNonChar     = Eof,      // Kinds above this one are character codes

        LParens     = '(',
        RParens     = ')',
        LBracket    = '[',
        RBracket    = ']',
        Dot         = '.',
        At          = '@',
        Comma       = ',',

        Star        = '*',      // NameTest
        Slash       = '/',      // Operator '/'
        Dollar      = '$',      // First part of VariableReference
        RBrace      = '}',      // Used for AVTs
    }

    // Splits an XPath expression into lexemes. The constructor scans the first lexeme;
    // every NextLex() call advances to the next one. The current lexeme is exposed through
    // Kind and the kind-specific properties (Name/Prefix, StringValue, Axis, ...).
    internal sealed class XPathScanner {
        private readonly string xpathExpr;  // Expression being scanned
        private int         curIndex;       // Index of curChar within xpathExpr
        private char        curChar;        // Character at curIndex, or '\0' once curIndex reaches the end
        private LexKind     kind;           // Kind of the current lexeme
        private string      name;           // Local name (or "*") of the current Name lexeme
        private string      prefix;         // Prefix of the current Name lexeme, empty if none
        private string      stringValue;    // Contents of the current String lexeme, quotes excluded
        private bool        canBeFunction;  // See the CanBeFunction property
        private int         lexStart;       // Index where the current lexeme starts
        private int         prevLexEnd;     // Scan position at the moment NextLex() was last entered
        private LexKind     prevKind;       // Kind of the lexeme preceding the current one
        private XPathAxis   axis;           // Axis of the current Axis lexeme

        private XmlCharType xmlCharType = XmlCharType.Instance;

        // Starts scanning xpathExpr from its first character.
        public XPathScanner(string xpathExpr) : this(xpathExpr, 0) {}

        // Starts scanning xpathExpr at index startFrom and reads the first lexeme.
        public XPathScanner(string xpathExpr, int startFrom) {
            Debug.Assert(xpathExpr != null);
            this.xpathExpr = xpathExpr;
            this.kind = LexKind.Unknown;    // No preceding lexeme yet; CheckOperator() relies on this
            SetSourceIndex(startFrom);
            NextLex();
        }

        public string   Source      { get { return xpathExpr;           } }
        public LexKind  Kind        { get { return kind;                } }
        public int      LexStart    { get { return lexStart;            } }
        public int      LexSize     { get { return curIndex - lexStart; } }
        public int      PrevLexEnd  { get { return prevLexEnd;          } }

        // Moves the scan position to the given index and loads the character found there.
        private void SetSourceIndex(int index) {
            Debug.Assert(0 <= index && index <= xpathExpr.Length);
            curIndex = index - 1;   // NextChar() pre-increments
            NextChar();
        }

        // Advances one character. Past the last character curChar becomes the '\0' sentinel.
        private void NextChar() {
            Debug.Assert(-1 <= curIndex && curIndex < xpathExpr.Length);
            curIndex++;
            if (curIndex < xpathExpr.Length) {
                curChar = xpathExpr[curIndex];
            } else {
                Debug.Assert(curIndex == xpathExpr.Length);
                curChar = '\0';
            }
        }

#if XML10_FIFTH_EDITION
        // Returns the character after curChar without advancing, or '\0' if there is none.
        private char PeekNextChar() {
            Debug.Assert(-1 <= curIndex && curIndex <= xpathExpr.Length);
            if (curIndex + 1 < xpathExpr.Length) {
                return xpathExpr[curIndex + 1];
            }
            else {
                return '\0';
            }
        }
#endif

        // Local part of the current Name lexeme ("*" for a "prefix:*" name test).
        public string Name {
            get {
                Debug.Assert(kind == LexKind.Name);
                Debug.Assert(name != null);
                return name;
            }
        }

        // Prefix of the current Name lexeme; empty string when the name has no prefix.
        public string Prefix {
            get {
                Debug.Assert(kind == LexKind.Name);
                Debug.Assert(prefix != null);
                return prefix;
            }
        }

        // Source text of the current lexeme, from LexStart up to the current scan position,
        // or the "<eof>" placeholder at the end of the expression.
        public string RawValue {
            get {
                if (kind == LexKind.Eof) {
                    return LexKindToString(kind);
                } else {
                    return xpathExpr.Substring(lexStart, curIndex - lexStart);
                }
            }
        }

        // Contents of the current String lexeme without the enclosing quotes.
        public string StringValue {
            get {
                Debug.Assert(kind == LexKind.String);
                Debug.Assert(stringValue != null);
                return stringValue;
            }
        }

        // Returns true if the character following a QName (possibly after intervening
        // ExprWhitespace) is '('. In this case the token must be recognized as a NodeType
        // or a FunctionName unless it is an OperatorName. This distinction cannot be done
        // without knowing the previous lexeme. For example, "or" in "... or (1 != 0)" may
        // be an OperatorName or a FunctionName.
        public bool CanBeFunction {
            get {
                Debug.Assert(kind == LexKind.Name);
                return canBeFunction;
            }
        }

        // Axis of the current Axis lexeme.
        public XPathAxis Axis {
            get {
                Debug.Assert(kind == LexKind.Axis);
                Debug.Assert(axis != XPathAxis.Unknown);
                return axis;
            }
        }

        // Advances past any whitespace at the current position.
        private void SkipSpace() {
            while (xmlCharType.IsWhiteSpace(curChar)) {
                NextChar();
            }
        }

        private static bool IsAsciiDigit(char ch) {
            // Characters below '0' wrap around to large unsigned values, so one comparison suffices
            return (uint)(ch - '0') <= 9;
        }

        // Scans the next lexeme and makes it the current one. Scanning errors inside a lexeme
        // (unclosed string literal, scientific notation) are thrown as XPathCompileException;
        // characters that do not start any lexeme are reported as LexKind.Unknown.
        public void NextLex() {
            prevLexEnd = curIndex;
            prevKind = kind;
            SkipSpace();
            lexStart = curIndex;

            switch (curChar) {
            case '\0':
                if (curIndex < xpathExpr.Length) {
                    // '\0' doubles as the end-of-input sentinel set by NextChar(). A literal
                    // U+0000 inside the expression is not the end of input, so report it as
                    // an unknown lexeme instead of silently ignoring the rest of the text.
                    kind = LexKind.Unknown;
                    NextChar();
                    break;
                }
                kind = LexKind.Eof;
                return;
            case '(': case ')': case '[': case ']':
            case '@': case ',': case '$': case '}':
                // Single-character lexemes: the LexKind value is the character code
                kind = (LexKind)curChar;
                NextChar();
                break;
            case '.':
                NextChar();
                if (curChar == '.') {
                    kind = LexKind.DotDot;
                    NextChar();
                } else if (IsAsciiDigit(curChar)) {
                    // ".5" is a number; rewind to the dot and let ScanNumber() consume it
                    SetSourceIndex(lexStart);
                    goto case '0';
                } else {
                    kind = LexKind.Dot;
                }
                break;
            case ':':
                NextChar();
                if (curChar == ':') {
                    kind = LexKind.ColonColon;
                    NextChar();
                } else {
                    kind = LexKind.Unknown;
                }
                break;
            case '*':
                kind = LexKind.Star;
                NextChar();
                CheckOperator(true);    // May turn the name test into LexKind.Multiply
                break;
            case '/':
                NextChar();
                if (curChar == '/') {
                    kind = LexKind.SlashSlash;
                    NextChar();
                } else {
                    kind = LexKind.Slash;
                }
                break;
            case '|':
                kind = LexKind.Union;
                NextChar();
                break;
            case '+':
                kind = LexKind.Plus;
                NextChar();
                break;
            case '-':
                kind = LexKind.Minus;
                NextChar();
                break;
            case '=':
                kind = LexKind.Eq;
                NextChar();
                break;
            case '!':
                NextChar();
                if (curChar == '=') {
                    kind = LexKind.Ne;
                    NextChar();
                } else {
                    kind = LexKind.Unknown;
                }
                break;
            case '<':
                NextChar();
                if (curChar == '=') {
                    kind = LexKind.Le;
                    NextChar();
                } else {
                    kind = LexKind.Lt;
                }
                break;
            case '>':
                NextChar();
                if (curChar == '=') {
                    kind = LexKind.Ge;
                    NextChar();
                } else {
                    kind = LexKind.Gt;
                }
                break;
            case '"':
            case '\'':
                kind = LexKind.String;
                ScanString();
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case '8': case '9':
                kind = LexKind.Number;
                ScanNumber();
                break;
            default:
                if (xmlCharType.IsStartNCNameSingleChar(curChar)
#if XML10_FIFTH_EDITION
                    || xmlCharType.IsNCNameHighSurrogateChar(curChar)
#endif
                    ) {
                    kind = LexKind.Name;
                    this.name = ScanNCName();
                    this.prefix = string.Empty;
                    this.canBeFunction = false;
                    this.axis = XPathAxis.Unknown;
                    bool colonColon = false;
                    int saveSourceIndex = curIndex;     // Position right after the first NCName

                    // "foo:bar" or "foo:*" -- one lexeme (no spaces allowed)
                    // "foo::"   or "foo ::" -- two lexemes, reported as one (AxisName)
                    // "foo:?"   or "foo :?" -- lexeme "foo" reported
                    if (curChar == ':') {
                        NextChar();
                        if (curChar == ':') {   // "foo::" -> OperatorName, AxisName
                            NextChar();
                            colonColon = true;
                            // Rewind so that "::" is scanned by the next NextLex() call
                            SetSourceIndex(saveSourceIndex);
                        } else {                // "foo:bar", "foo:*" or "foo:?"
                            if (curChar == '*') {
                                NextChar();
                                this.prefix = this.name;
                                this.name = "*";
                            } else if (xmlCharType.IsStartNCNameSingleChar(curChar)
#if XML10_FIFTH_EDITION
                                || xmlCharType.IsNCNameHighSurrogateChar(curChar)
#endif
                                ) {
                                this.prefix = this.name;
                                this.name = ScanNCName();
                                // Look ahead for '(' to determine whether QName can be a FunctionName
                                saveSourceIndex = curIndex;
                                SkipSpace();
                                this.canBeFunction = (curChar == '(');
                                SetSourceIndex(saveSourceIndex);
                            } else {            // "foo:?" -> OperatorName, NameTest
                                // Return "foo" and leave ":" to be reported later as an unknown lexeme
                                SetSourceIndex(saveSourceIndex);
                            }
                        }
                    } else {
                        SkipSpace();
                        if (curChar == ':') {   // "foo ::" or "foo :?"
                            NextChar();
                            if (curChar == ':') {
                                NextChar();
                                colonColon = true;
                            }
                            SetSourceIndex(saveSourceIndex);
                        } else {
                            // NOTE: the position is intentionally not rewound here, so the
                            // whitespace skipped above stays part of LexSize and RawValue.
                            this.canBeFunction = (curChar == '(');
                        }
                    }
                    // An operator name takes precedence; otherwise "name::" must be an axis
                    if (!CheckOperator(false) && colonColon) {
                        this.axis = CheckAxis();
                    }
                } else {
                    kind = LexKind.Unknown;
                    NextChar();
                }
                break;
            }
        }

        // Decides whether the current '*' (star == true) or unprefixed NCName (star == false)
        // is an operator, based on the preceding lexeme. If it is, sets this.kind to the
        // operator kind and returns true; otherwise leaves this.kind alone and returns false.
        private bool CheckOperator(bool star) {
            LexKind opKind;

            if (star) {
                opKind = LexKind.Multiply;
            } else {
                // Operator names are unprefixed and at most three characters long
                if (prefix.Length != 0 || name.Length > 3) {
                    return false;
                }

                switch (name) {
                case "or" : opKind = LexKind.Or;      break;
                case "and": opKind = LexKind.And;     break;
                case "div": opKind = LexKind.Divide;  break;
                case "mod": opKind = LexKind.Modulo;  break;
                default   : return false;
                }
            }

            // If there is a preceding token and the preceding token is not one of '@', '::', '(', '[', ',' or an Operator,
            // then a '*' must be recognized as a MultiplyOperator and an NCName must be recognized as an OperatorName.

            // Covers "no preceding token" (Unknown is the initial kind) and the operators up to Union
            if (prevKind <= LexKind.LastOperator) {
                return false;
            }

            // '/' and '//' are not within the Unknown..LastOperator range, so they are listed
            // explicitly, together with '$'
            switch (prevKind) {
            case LexKind.Slash:
            case LexKind.SlashSlash:
            case LexKind.At:
            case LexKind.ColonColon:
            case LexKind.LParens:
            case LexKind.LBracket:
            case LexKind.Comma:
            case LexKind.Dollar:
                return false;
            }

            this.kind = opKind;
            return true;
        }

        // Maps the current name to an axis. Sets this.kind to Axis when the name is a known
        // axis name; otherwise leaves it as Name and returns XPathAxis.Unknown.
        private XPathAxis CheckAxis() {
            this.kind = LexKind.Axis;
            switch (name) {
            case "ancestor"           : return XPathAxis.Ancestor;
            case "ancestor-or-self"   : return XPathAxis.AncestorOrSelf;
            case "attribute"          : return XPathAxis.Attribute;
            case "child"              : return XPathAxis.Child;
            case "descendant"         : return XPathAxis.Descendant;
            case "descendant-or-self" : return XPathAxis.DescendantOrSelf;
            case "following"          : return XPathAxis.Following;
            case "following-sibling"  : return XPathAxis.FollowingSibling;
            case "namespace"          : return XPathAxis.Namespace;
            case "parent"             : return XPathAxis.Parent;
            case "preceding"          : return XPathAxis.Preceding;
            case "preceding-sibling"  : return XPathAxis.PrecedingSibling;
            case "self"               : return XPathAxis.Self;
            default                   : this.kind = LexKind.Name; return XPathAxis.Unknown;
            }
        }

        // Consumes a numeric literal: digits, optionally followed by '.' and more digits
        // (the leading digits may be absent, as in ".5"). Throws XPath_ScientificNotation
        // if an exponent follows.
        private void ScanNumber() {
            Debug.Assert(IsAsciiDigit(curChar) || curChar == '.');
            while (IsAsciiDigit(curChar)) {
                NextChar();
            }
            if (curChar == '.') {
                NextChar();
                while (IsAsciiDigit(curChar)) {
                    NextChar();
                }
            }
            // Clearing bit 0x20 folds 'e' onto 'E'
            if ((curChar & (~0x20)) == 'E') {
                // Consume the whole exponent first so the exception spans the entire literal
                NextChar();
                if (curChar == '+' || curChar == '-') {
                    NextChar();
                }
                while (IsAsciiDigit(curChar)) {
                    NextChar();
                }
                throw CreateException(Res.XPath_ScientificNotation);
            }
        }

        // Consumes a string literal delimited by the quote character at the current position
        // and stores its contents in stringValue. Throws XPath_UnclosedString if the closing
        // quote is missing.
        private void ScanString() {
            int startIdx = curIndex + 1;
            int endIdx = xpathExpr.IndexOf(curChar, startIdx);

            if (endIdx < 0) {
                // Move to the end of input so the exception spans the rest of the expression
                SetSourceIndex(xpathExpr.Length);
                throw CreateException(Res.XPath_UnclosedString);
            }

            this.stringValue = xpathExpr.Substring(startIdx, endIdx - startIdx);
            SetSourceIndex(endIdx + 1);
        }

        // Consumes an NCName starting at the current position and returns its text.
        private string ScanNCName() {
            Debug.Assert(xmlCharType.IsStartNCNameSingleChar(curChar)
#if XML10_FIFTH_EDITION
                || xmlCharType.IsNCNameHighSurrogateChar(curChar)
#endif
                );
            int start = curIndex;
            for (;;) {
                if (xmlCharType.IsNCNameSingleChar(curChar)) {
                    NextChar();
                }
#if XML10_FIFTH_EDITION
                else if (xmlCharType.IsNCNameSurrogateChar(PeekNextChar(), curChar)) {
                    // A surrogate pair occupies two chars
                    NextChar();
                    NextChar();
                }
#endif
                else {
                    break;
                }
            }
            return xpathExpr.Substring(start, curIndex - start);
        }

        // Verifies that the current lexeme is of kind t, then advances to the next lexeme.
        public void PassToken(LexKind t) {
            CheckToken(t);
            NextLex();
        }

        // Throws XPathCompileException unless the current lexeme is of kind t.
        public void CheckToken(LexKind t) {
            Debug.Assert(LexKind.FirstStringable <= t);
            if (kind != t) {
                if (t == LexKind.Eof) {
                    throw CreateException(Res.XPath_EofExpected, RawValue);
                } else {
                    throw CreateException(Res.XPath_TokenExpected, LexKindToString(t), RawValue);
                }
            }
        }

        // May be called for the following tokens: Name, String, Eof, Comma, LParens, RParens, LBracket, RBracket, RBrace
        private string LexKindToString(LexKind t) {
            Debug.Assert(LexKind.FirstStringable <= t);

            // Kinds above LastNonChar are character codes, so the character is its own description
            if (LexKind.LastNonChar < t) {
                Debug.Assert("()[].@,*/$}".IndexOf((char)t) >= 0);
                return new String((char)t, 1);
            }

            switch (t) {
            case LexKind.Name   : return "<name>";
            case LexKind.String : return "<string literal>";
            case LexKind.Eof    : return "<eof>";
            default:
                Debug.Fail("Unexpected LexKind: " + t.ToString());
                return string.Empty;
            }
        }

        // Creates (does not throw) a compile exception covering the source range from the start
        // of the current lexeme to the current scan position.
        public XPathCompileException CreateException(string resId, params string[] args) {
            return new XPathCompileException(xpathExpr, lexStart, curIndex, resId, args);
        }
    }
}