1package lexer 2 3import ( 4 "fmt" 5 "strings" 6 "unicode" 7 "unicode/utf8" 8) 9 10const ( 11 //XItemError is an error with the parser input 12 XItemError XItemType = "Error" 13 //XItemAbsLocPath is an absolute path 14 XItemAbsLocPath = "Absolute path" 15 //XItemAbbrAbsLocPath represents an abbreviated absolute path 16 XItemAbbrAbsLocPath = "Abbreviated absolute path" 17 //XItemAbbrRelLocPath marks the start of a path expression 18 XItemAbbrRelLocPath = "Abbreviated relative path" 19 //XItemRelLocPath represents a relative location path 20 XItemRelLocPath = "Relative path" 21 //XItemEndPath marks the end of a path 22 XItemEndPath = "End path instruction" 23 //XItemAxis marks an axis specifier of a path 24 XItemAxis = "Axis" 25 //XItemAbbrAxis marks an abbreviated axis specifier (just @ at this point) 26 XItemAbbrAxis = "Abbreviated attribute axis" 27 //XItemNCName marks a namespace name in a node test 28 XItemNCName = "Namespace" 29 //XItemQName marks the local name in an a node test 30 XItemQName = "Local name" 31 //XItemNodeType marks a node type in a node test 32 XItemNodeType = "Node type" 33 //XItemProcLit marks a processing-instruction literal 34 XItemProcLit = "processing-instruction" 35 //XItemFunction marks a function call 36 XItemFunction = "function" 37 //XItemArgument marks a function argument 38 XItemArgument = "function argument" 39 //XItemEndFunction marks the end of a function 40 XItemEndFunction = "end of function" 41 //XItemPredicate marks a predicate in an axis 42 XItemPredicate = "predicate" 43 //XItemEndPredicate marks a predicate in an axis 44 XItemEndPredicate = "end of predicate" 45 //XItemStrLit marks a string literal 46 XItemStrLit = "string literal" 47 //XItemNumLit marks a numeric literal 48 XItemNumLit = "numeric literal" 49 //XItemOperator marks an operator 50 XItemOperator = "operator" 51 //XItemVariable marks a variable reference 52 XItemVariable = "variable" 53) 54 55const ( 56 eof = -(iota + 1) 57) 58 59//XItemType is the parser token types 60type XItemType string 61 62//XItem is the token emitted from the parser 63type XItem struct { 64 Typ XItemType 65 Val string 66} 67 68type stateFn func(*Lexer) stateFn 69 70//Lexer lexes out XPath expressions 71type Lexer struct { 72 input string 73 start int 74 pos int 75 width int 76 items chan XItem 77} 78 79//Lex an XPath expresion on the io.Reader 80func Lex(xpath string) chan XItem { 81 l := &Lexer{ 82 input: xpath, 83 items: make(chan XItem), 84 } 85 go l.run() 86 return l.items 87} 88 89func (l *Lexer) run() { 90 for state := startState; state != nil; { 91 state = state(l) 92 } 93 94 if l.peek() != eof { 95 l.errorf("Malformed XPath expression") 96 } 97 98 close(l.items) 99} 100 101func (l *Lexer) emit(t XItemType) { 102 l.items <- XItem{t, l.input[l.start:l.pos]} 103 l.start = l.pos 104} 105 106func (l *Lexer) emitVal(t XItemType, val string) { 107 l.items <- XItem{t, val} 108 l.start = l.pos 109} 110 111func (l *Lexer) next() (r rune) { 112 if l.pos >= len(l.input) { 113 l.width = 0 114 return eof 115 } 116 117 r, l.width = utf8.DecodeRuneInString(l.input[l.pos:]) 118 119 l.pos += l.width 120 121 return r 122} 123 124func (l *Lexer) ignore() { 125 l.start = l.pos 126} 127 128func (l *Lexer) backup() { 129 l.pos -= l.width 130} 131 132func (l *Lexer) peek() rune { 133 r := l.next() 134 135 l.backup() 136 return r 137} 138 139func (l *Lexer) peekAt(n int) rune { 140 if n <= 1 { 141 return l.peek() 142 } 143 144 width := 0 145 var ret rune 146 147 for count := 0; count < n; count++ { 148 r, s := utf8.DecodeRuneInString(l.input[l.pos+width:]) 149 width += s 150 151 if l.pos+width > len(l.input) { 152 return eof 153 } 154 155 ret = r 156 } 157 158 return ret 159} 160 161func (l *Lexer) accept(valid string) bool { 162 if strings.ContainsRune(valid, l.next()) { 163 return true 164 } 165 166 l.backup() 167 return false 168} 169 170func (l *Lexer) acceptRun(valid string) { 171 for strings.ContainsRune(valid, l.next()) { 172 } 173 l.backup() 174} 175 176func (l *Lexer) skip(num int) { 177 for i := 0; i < num; i++ { 178 l.next() 179 } 180 l.ignore() 181} 182 183func (l *Lexer) skipWS(ig bool) { 184 for { 185 n := l.next() 186 187 if n == eof || !unicode.IsSpace(n) { 188 break 189 } 190 } 191 192 l.backup() 193 194 if ig { 195 l.ignore() 196 } 197} 198 199func (l *Lexer) errorf(format string, args ...interface{}) stateFn { 200 l.items <- XItem{ 201 XItemError, 202 fmt.Sprintf(format, args...), 203 } 204 205 return nil 206} 207 208func isElemChar(r rune) bool { 209 return string(r) != ":" && string(r) != "/" && 210 (unicode.Is(first, r) || unicode.Is(second, r) || string(r) == "*") && 211 r != eof 212} 213 214func startState(l *Lexer) stateFn { 215 l.skipWS(true) 216 217 if string(l.peek()) == "/" { 218 l.next() 219 l.ignore() 220 221 if string(l.next()) == "/" { 222 l.ignore() 223 return abbrAbsLocPathState 224 } 225 226 l.backup() 227 return absLocPathState 228 } else if string(l.peek()) == `'` || string(l.peek()) == `"` { 229 if err := getStrLit(l, XItemStrLit); err != nil { 230 return l.errorf(err.Error()) 231 } 232 233 if l.peek() != eof { 234 return startState 235 } 236 } else if getNumLit(l) { 237 l.skipWS(true) 238 if l.peek() != eof { 239 return startState 240 } 241 } else if string(l.peek()) == "$" { 242 l.next() 243 l.ignore() 244 r := l.peek() 245 for unicode.Is(first, r) || unicode.Is(second, r) { 246 l.next() 247 r = l.peek() 248 } 249 tok := l.input[l.start:l.pos] 250 if len(tok) == 0 { 251 return l.errorf("Empty variable name") 252 } 253 l.emit(XItemVariable) 254 l.skipWS(true) 255 if l.peek() != eof { 256 return startState 257 } 258 } else if st := findOperatorState(l); st != nil { 259 return st 260 } else { 261 if isElemChar(l.peek()) { 262 colons := 0 263 264 for { 265 if isElemChar(l.peek()) { 266 l.next() 267 } else if string(l.peek()) == ":" { 268 l.next() 269 colons++ 270 } else { 271 break 272 } 273 } 274 275 if string(l.peek()) == "(" && colons <= 1 { 276 tok := l.input[l.start:l.pos] 277 err := procFunc(l, tok) 278 if err != nil { 279 return l.errorf(err.Error()) 280 } 281 282 l.skipWS(true) 283 284 if string(l.peek()) == "/" { 285 l.next() 286 l.ignore() 287 288 if string(l.next()) == "/" { 289 l.ignore() 290 return abbrRelLocPathState 291 } 292 293 l.backup() 294 return relLocPathState 295 } 296 297 return startState 298 } 299 300 l.pos = l.start 301 return relLocPathState 302 } else if string(l.peek()) == "@" { 303 return relLocPathState 304 } 305 } 306 307 return nil 308} 309 310func strPeek(str string, l *Lexer) bool { 311 for i := 0; i < len(str); i++ { 312 if string(l.peekAt(i+1)) != string(str[i]) { 313 return false 314 } 315 } 316 return true 317} 318 319func findOperatorState(l *Lexer) stateFn { 320 l.skipWS(true) 321 322 switch string(l.peek()) { 323 case ">", "<", "!": 324 l.next() 325 if string(l.peek()) == "=" { 326 l.next() 327 } 328 l.emit(XItemOperator) 329 return startState 330 case "|", "+", "-", "*", "=": 331 l.next() 332 l.emit(XItemOperator) 333 return startState 334 case "(": 335 l.next() 336 l.emit(XItemOperator) 337 for state := startState; state != nil; { 338 state = state(l) 339 } 340 l.skipWS(true) 341 if string(l.next()) != ")" { 342 return l.errorf("Missing end )") 343 } 344 l.emit(XItemOperator) 345 return startState 346 } 347 348 if strPeek("and", l) { 349 l.next() 350 l.next() 351 l.next() 352 l.emit(XItemOperator) 353 return startState 354 } 355 356 if strPeek("or", l) { 357 l.next() 358 l.next() 359 l.emit(XItemOperator) 360 return startState 361 } 362 363 if strPeek("mod", l) { 364 l.next() 365 l.next() 366 l.next() 367 l.emit(XItemOperator) 368 return startState 369 } 370 371 if strPeek("div", l) { 372 l.next() 373 l.next() 374 l.next() 375 l.emit(XItemOperator) 376 return startState 377 } 378 379 return nil 380} 381 382func getStrLit(l *Lexer, tok XItemType) error { 383 q := l.next() 384 var r rune 385 386 l.ignore() 387 388 for r != q { 389 r = l.next() 390 if r == eof { 391 return fmt.Errorf("Unexpected end of string literal.") 392 } 393 } 394 395 l.backup() 396 l.emit(tok) 397 l.next() 398 l.ignore() 399 400 return nil 401} 402 403func getNumLit(l *Lexer) bool { 404 const dig = "0123456789" 405 l.accept("-") 406 start := l.pos 407 l.acceptRun(dig) 408 409 if l.pos == start { 410 return false 411 } 412 413 if l.accept(".") { 414 l.acceptRun(dig) 415 } 416 417 l.emit(XItemNumLit) 418 return true 419} 420