1# -*- coding: utf-8 -*- 2# cython: language_level=3, always_allow_keywords=True 3 4## Copyright 1999-2018 by LivingLogic AG, Bayreuth/Germany 5## Copyright 1999-2018 by Walter Dörwald 6## 7## All Rights Reserved 8## 9## See ll/xist/__init__.py for the license 10 11 12""" 13This module contains XFind selectors and related classes and functions. 14 15A selector specifies a condition that a node in an XIST tree must satisfy to 16match the selector. For example the method :meth:`Node.walk` will only output 17nodes that match the specified selector. 18 19Selectors can be combined with various operations and form a language comparable 20to XPath__ but implemented as Python expressions. 21 22__ http://www.w3.org/TR/xpath 23""" 24 25 26import builtins, collections 27 28from ll import misc 29from ll.xist import xsc 30 31 32__docformat__ = "reStructuredText" 33 34 35### 36### Function for filtering a :class:`xsc.Cursor` iterator against a :class:`Selector`. 37### 38 39def filter(iter, *selectors): 40 """ 41 Filter an iterator over :class:`xsc.Cursor` objects against a 42 :class:`Selector` object. 43 44 Example:: 45 46 >>> from ll.xist import xsc, parse, xfind 47 >>> from ll.xist.ns import xml, html, chars 48 >>> doc = parse.tree( 49 ... parse.URL("https://www.python.org/"), 50 ... parse.Tidy(), 51 ... parse.NS(html), 52 ... parse.Node(pool=xsc.Pool(xml, html, chars)) 53 ... ) 54 >>> [c.node.string() for c in xfind.filter(doc.walk(), html.b, html.title)] 55 [ 56 '<title>Welcome to Python.org</title>', 57 '<b>Web Programming</b>', 58 '<b>GUI Development</b>', 59 '<b>Scientific and Numeric</b>', 60 '<b>Software Development</b>', 61 '<b>System Administration</b>' 62 ] 63 """ 64 sel = selector(*selectors) 65 for cursor in iter: 66 if cursor.path in sel: 67 yield cursor 68 69 70### 71### Function for creating a :class:`Selector` object. 72### 73 74def selector(*objs): 75 """ 76 Create a :class:`Selector` object from :obj:`objs`. 77 78 If :obj:`objs` is empty (i.e. :func:`selector` is called without arguments) 79 ``any`` is returned (which matches every node). 80 81 If more than one argument is passed (or the argument is a tuple), an 82 :class:`OrCombinator` is returned. 83 84 Otherwise the following steps are taken for the single argument ``obj``: 85 86 * if ``obj`` already is a :class:`Selector` object it is returned unchanged; 87 88 * if ``obj`` is a :class:`Node` subclass, an :class:`IsInstanceSelector` 89 is returned (which matches if the node is an instance of this class); 90 91 * if ``obj`` is a :class:`Node` instance, an :class:`IsSelector` is returned 92 (which matches only ``obj``); 93 94 * if ``obj`` is callable a :class:`CallableSelector` is returned 95 (where matching is done by calling ``obj``); 96 97 * if ``obj`` is ``None`` ``any`` will be returned; 98 99 * otherwise :func:`selector` will raise a :exc:`TypeError`. 100 """ 101 if not objs: 102 return any 103 if len(objs) == 1: 104 obj = objs[0] 105 if isinstance(obj, Selector): 106 return obj 107 if isinstance(obj, xsc._Node_Meta): 108 return IsInstanceSelector(obj) 109 elif isinstance(obj, tuple): 110 return selector(*obj) 111 elif isinstance(obj, xsc.Node): 112 return IsSelector(obj) 113 elif isinstance(obj, collections.Callable): 114 return CallableSelector(obj) 115 elif obj is None: 116 return any 117 else: 118 raise TypeError(f"can't convert {obj!r} to selector") 119 elif all(isinstance(sel, type) for sel in objs): 120 return IsInstanceSelector(*objs) 121 return OrCombinator(*objs) 122 123 124### 125### Selectors for the :meth:`walk` method. 126### 127 128class Selector: 129 """ 130 A selector specifies a condition that a node in an XIST tree must satisfy 131 to match the selector. 132 133 Whether a node matches the selector can be specified by overwriting the 134 :meth:`__contains__` method. Selectors can be combined with various 135 operations (see methods below). 136 """ 137 138 @misc.notimplemented 139 def __contains__(self, path): 140 """ 141 Return whether :obj:`path` (which is a list of XIST nodes from the root 142 of the tree to the node in question) matches the selector. 143 """ 144 145 def __truediv__(self, other): 146 """ 147 Create a :class:`ChildCombinator` with :obj:`self` as the left hand 148 selector and :obj:`other` as the right hand selector. 149 """ 150 return ChildCombinator(self, selector(other)) 151 152 def __floordiv__(self, other): 153 """ 154 Create a :class:`DescendantCombinator` with :obj:`self` as the left hand 155 selector and :obj:`other` as the right hand selector. 156 """ 157 return DescendantCombinator(self, selector(other)) 158 159 def __mul__(self, other): 160 """ 161 Create an :class:`AdjacentSiblingCombinator` with :obj:`self` as the left 162 hand selector and :obj:`other` as the right hand selector. 163 """ 164 return AdjacentSiblingCombinator(self, selector(other)) 165 166 def __pow__(self, other): 167 """ 168 Create a :class:`GeneralSiblingCombinator` with :obj:`self` as the left 169 hand selector and :obj:`other` as the right hand selector. 170 """ 171 return GeneralSiblingCombinator(self, selector(other)) 172 173 def __and__(self, other): 174 """ 175 Create an :class:`AndCombinator` from :obj:`self` and :obj:`other`. 176 """ 177 return AndCombinator(self, selector(other)) 178 179 def __or__(self, other): 180 """ 181 Create an :class:`OrCombinator` from :obj:`self` and :obj:`other`. 182 """ 183 return OrCombinator(self, selector(other)) 184 185 def __invert__(self): 186 """ 187 Create a :class:`NotCombinator` inverting :obj:`self`. 188 """ 189 return NotCombinator(self) 190 191 192 193class AnySelector(Selector): 194 """ 195 Selector that selects all nodes. 196 197 An instance of this class named ``any`` is created as a module global, i.e. 198 you can use ``xfind.any``. 199 """ 200 201 def __contains__(self, path): 202 return True 203 204 def __and__(self, other): 205 return selector(other) 206 207 def __or__(self, other): 208 return self 209 210 211any = AnySelector() 212 213 214class IsInstanceSelector(Selector): 215 """ 216 Selector that selects all nodes that are instances of the specified type. 217 You can either create an :class:`IsInstanceSelector` object directly 218 or simply pass a class to a function that expects a selector (this class 219 will be automatically wrapped in an :class:`IsInstanceSelector`):: 220 221 >>> from ll.xist import xsc, parse, xfind 222 >>> from ll.xist.ns import xml, html, chars 223 >>> doc = parse.tree( 224 ... parse.URL("https://www.python.org/"), 225 ... parse.Tidy(), 226 ... parse.NS(html), 227 ... parse.Node(pool=xsc.Pool(xml, html, chars)) 228 ... ) 229 >>> for node in doc.walknodes(html.a): 230 ... print(node.attrs.href, node.attrs.title) 231 ... 232 https://www.python.org/#content Skip to content 233 https://www.python.org/#python-network 234 https://www.python.org/ The Python Programming Language 235 https://www.python.org/psf-landing/ The Python Software Foundation 236 ... 237 """ 238 def __init__(self, *types): 239 self.types = types 240 241 def __contains__(self, path): 242 return isinstance(path[-1], self.types) 243 244 def __or__(self, other): 245 # If ``other`` is a type check too, combine ``self`` and ``other`` into one :class:`IsInstanceSelector` object 246 if isinstance(other, xsc._Node_Meta): 247 return IsInstanceSelector(*(self.types + (other,))) 248 elif isinstance(other, IsInstanceSelector): 249 return IsInstanceSelector(*(self.types+other.types)) 250 return Selector.__or__(self, other) 251 252 def __getitem__(self, index): 253 """ 254 Return an :class:`nthoftype` selector that uses :obj:`index` as the 255 index and ``self.types`` as the types. 256 """ 257 return nthoftype(index, *self.types) 258 259 def __str__(self): 260 if len(self.types) == 1: 261 return f"{self.types[0].__module__}.{self.types[0].__name__}" 262 else: 263 types = " | ".join(f"{type.__module__}.{type.__name__}" for type in self.types) 264 return f"({types})" 265 266 267class element(Selector): 268 """ 269 Selector that selects all elements that have a specified namespace name and 270 element name:: 271 272 >>> from ll.xist import xsc, parse, xfind 273 >>> from ll.xist.ns import xml, html, chars 274 >>> doc = parse.tree( 275 ... parse.URL("https://www.python.org/"), 276 ... parse.Tidy(), 277 ... parse.NS(html), 278 ... parse.Node(pool=xsc.Pool(xml, html, chars)) 279 ... ) 280 >>> for node in doc.walknodes(xfind.element(html, "img")): 281 ... print(node.string()) 282 ... 283 <img alt="python™" class="python-logo" src="https://www.python.org/static/img/python-logo.png" /> 284 """ 285 def __init__(self, xmlns, xmlname): 286 self.xmlns = xsc.nsname(xmlns) 287 self.xmlname = xmlname 288 289 def __contains__(self, path): 290 node = path[-1] 291 return isinstance(node, xsc.Element) and node.xmlns == self.xmlns and node.xmlname == self.xmlname 292 293 def __str__(self): 294 return f"{self.__class__.__qualname__}({self.name!r}, {self.xmlns!r})" 295 296 297class procinst(Selector): 298 """ 299 Selector that selects all processing instructions that have a specified name. 300 """ 301 def __init__(self, xmlname): 302 self.xmlname = xmlname 303 304 def __contains__(self, path): 305 node = path[-1] 306 return isinstance(node, xsc.ProcInst) and node.xmlname == self.xmlname 307 308 def __str__(self): 309 return f"{self.__class__.__qualname__}({self.name!r})" 310 311 312class entity(Selector): 313 """ 314 Selector that selects all entities that have a specified name. 315 """ 316 def __init__(self, xmlname): 317 self.xmlname = xmlname 318 319 def __contains__(self, path): 320 node = path[-1] 321 return isinstance(node, xsc.Entity) and node.xmlname == self.xmlname 322 323 def __str__(self): 324 return f"{self.__class__.__qualname__}({self.name!r})" 325 326 327class IsSelector(Selector): 328 """ 329 Selector that selects one specific node in the tree. This can be combined 330 with other selectors via :class:`ChildCombinator` or 331 :class:`DescendantCombinator` selectors to select children of this specific 332 node. You can either create an :class:`IsSelector` directly or simply pass 333 a node to a function that expects a selector:: 334 335 >>> from ll.xist import xsc, parse 336 >>> from ll.xist.ns import xml, html, chars 337 >>> doc = parse.tree( 338 ... parse.URL("https://www.python.org/"), 339 ... parse.Tidy(), 340 ... parse.NS(html), 341 ... parse.Node(pool=xsc.Pool(xml, html, chars)) 342 ... ) 343 >>> for node in doc.walknodes(doc[0]/xsc.Element): 344 ... print(repr(node)) 345 ... 346 <element ll.xist.ns.html.head xmlns='http://www.w3.org/1999/xhtml' (89 children/no attrs) location='https://www.python.org/:?:?' at 0x104ad7630> 347 <element ll.xist.ns.html.body xmlns='http://www.w3.org/1999/xhtml' (14 children/2 attrs) location='https://www.python.org/:?:?' at 0x104cc1f28> 348 """ 349 def __init__(self, node): 350 self.node = node 351 352 def __contains__(self, path): 353 return path[-1] is self.node 354 355 def __str__(self): 356 return f"{self.__class__.__qualname__}({self.node!r})" 357 358 359class IsRootSelector(Selector): 360 """ 361 Selector that selects the node that is the root of the traversal. 362 363 An instance of this class named ``isroot`` is created as a module global, 364 i.e. you can use ``xfind.isroot``. 365 """ 366 def __contains__(self, path): 367 return len(path) == 1 368 369 370isroot = IsRootSelector() 371 372 373class IsEmptySelector(Selector): 374 """ 375 Selector that selects all empty elements or fragments. 376 377 An instance of this class named ``empty`` is created as a module global, 378 i.e. you can use ``xfind.empty``:: 379 380 >>> from ll.xist import xsc, parse, xfind 381 >>> from ll.xist.ns import xml, html, chars 382 >>> doc = parse.tree( 383 ... parse.URL("https://www.python.org/"), 384 ... parse.Tidy(), 385 ... parse.NS(html), 386 ... parse.Node(pool=xsc.Pool(xml, html, chars)) 387 ... ) 388 >>> for node in doc.walknodes(xfind.empty): 389 ... print(node.string()) 390 ... 391 <meta charset="utf-8" /> 392 <meta http-equiv="X-UA-Compatible" content="IE=edge" /> 393 <link href="https://ajax.googleapis.com/" rel="prefetch" /> 394 <meta name="application-name" content="Python.org" /> 395 ... 396 """ 397 398 def __contains__(self, path): 399 node = path[-1] 400 if isinstance(node, (xsc.Element, xsc.Frag)): 401 return len(node) == 0 402 return False 403 404 405empty = IsEmptySelector() 406 407 408class OnlyChildSelector(Selector): 409 """ 410 Selector that selects all nodes that are the only child of their parents. 411 412 An instance of this class named ``onlychild`` is created as a module global, 413 i.e. you can use ``xfind.onlychild``:: 414 415 >>> from ll.xist import xsc, parse, xfind 416 >>> from ll.xist.ns import xml, html, chars 417 >>> doc = parse.tree( 418 ... parse.URL("https://www.python.org/"), 419 ... parse.Tidy(), 420 ... parse.NS(html), 421 ... parse.Node(pool=xsc.Pool(xml, html, chars)) 422 ... ) 423 >>> for node in doc.walknodes(xfind.onlychild & html.a): 424 ... print(node.string()) 425 ... 426 <a class="text-shrink" href="javascript:;" title="Make Text Smaller">Smaller</a> 427 <a class="text-grow" href="javascript:;" title="Make Text Larger">Larger</a> 428 <a class="text-reset" href="javascript:;" title="Reset any font size changes I have made">Reset</a> 429 <a href="http://plus.google.com/+Python"><span aria-hidden="true" class="icon-google-plus"></span>Google+</a> 430 ... 431 """ 432 433 def __contains__(self, path): 434 if len(path) >= 2: 435 parent = path[-2] 436 if isinstance(parent, (xsc.Frag, xsc.Element)): 437 return len(parent) == 1 and parent[0] is path[-1] 438 return False 439 440 def __str__(self): 441 return "onlychild" 442 443 444onlychild = OnlyChildSelector() 445 446 447class OnlyOfTypeSelector(Selector): 448 """ 449 Selector that selects all nodes that are the only nodes of their type among 450 their siblings. 451 452 An instance of this class named ``onlyoftype`` is created as a module global, 453 i.e. you can use ``xfind.onlyoftype``:: 454 455 >>> from ll.xist import xsc, parse, xfind 456 >>> from ll.xist.ns import xml, html, chars 457 >>> doc = parse.tree( 458 ... parse.URL("https://www.python.org/"), 459 ... parse.Tidy(), 460 ... parse.NS(html), 461 ... parse.Node(pool=xsc.Pool(xml, html, chars)) 462 ... ) 463 >>> for node in doc.walknodes(xfind.onlyoftype & xsc.Element): 464 ... print(repr(node)) 465 ... 466 <element ll.xist.ns.html.html xmlns='http://www.w3.org/1999/xhtml' (7 children/3 attrs) location='https://www.python.org/:?:?' at 0x108858d30> 467 <element ll.xist.ns.html.head xmlns='http://www.w3.org/1999/xhtml' (89 children/no attrs) location='https://www.python.org/:?:?' at 0x108858630> 468 <element ll.xist.ns.html.title xmlns='http://www.w3.org/1999/xhtml' (1 child/no attrs) location='https://www.python.org/:?:?' at 0x108c547b8> 469 <element ll.xist.ns.html.body xmlns='http://www.w3.org/1999/xhtml' (14 children/2 attrs) location='https://www.python.org/:?:?' at 0x108c54eb8> 470 ... 471 """ 472 473 def __contains__(self, path): 474 if len(path) >= 2: 475 node = path[-1] 476 parent = path[-2] 477 if isinstance(parent, (xsc.Frag, xsc.Element)): 478 for child in parent: 479 if isinstance(child, node.__class__): 480 if child is not node: 481 return False 482 return True 483 return False 484 485 def __str__(self): 486 return "onlyoftype" 487 488 489onlyoftype = OnlyOfTypeSelector() 490 491 492class hasattr(Selector): 493 """ 494 Selector that selects all element nodes that have an attribute with one of 495 the specified names. (Names can be strings, (attribute name, namespace name) 496 tuples or attribute classes or instances):: 497 498 >>> from ll.xist import xsc, parse, xfind 499 >>> from ll.xist.ns import xml, html, chars 500 >>> doc = parse.tree( 501 ... parse.URL("https://www.python.org/"), 502 ... parse.Tidy(), 503 ... parse.NS(html), 504 ... parse.Node(pool=xsc.Pool(xml, html, chars)) 505 ... ) 506 >>> for node in doc.walknodes(xfind.hasattr("id")): 507 ... print(node.xmlname, node.attrs.id) 508 ... 509 body homepage 510 div touchnav-wrapper 511 div top 512 a close-python-network 513 ... 514 """ 515 516 def __init__(self, *attrnames): 517 self.attrnames = attrnames 518 519 def __contains__(self, path): 520 node = path[-1] 521 if isinstance(node, xsc.Element): 522 for attrname in self.attrnames: 523 if attrname in node.attrs: 524 return True 525 return False 526 527 def __str__(self): 528 attrnames = ", ".join(repr(attrname) for attrname in self.attrnames) 529 return f"{self.__class__.__qualname__}({attrname})" 530 531 532class attrhasvalue(Selector): 533 """ 534 Selector that selects all element nodes where an attribute with the specified 535 name has one of the specified values. (Names can be strings, 536 (attribute name, namespace name) tuples or attribute classes or instances). 537 Note that "fancy" attributes (i.e. those containing non-text) will not be 538 considered:: 539 540 >>> from ll.xist import xsc, parse, xfind 541 >>> from ll.xist.ns import xml, html, chars 542 >>> doc = parse.tree( 543 ... parse.URL("https://www.python.org/"), 544 ... parse.Tidy(), 545 ... parse.NS(html), 546 ... parse.Node(pool=xsc.Pool(xml, html, chars)) 547 ... ) 548 >>> for node in doc.walknodes(xfind.attrhasvalue("rel", "stylesheet")): 549 ... print(node.attrs.href) 550 ... 551 https://www.python.org/static/stylesheets/style.css 552 https://www.python.org/static/stylesheets/mq.css 553 """ 554 555 def __init__(self, attrname, *attrvalues): 556 self.attrname = attrname 557 if not attrvalues: 558 raise ValueError("need at least one attribute value") 559 self.attrvalues = attrvalues 560 561 def __contains__(self, path): 562 node = path[-1] 563 if isinstance(node, xsc.Element): 564 attr = node.attrs.get(self.attrname) 565 if not attr.isfancy(): # if there are PIs, say no 566 return str(attr) in self.attrvalues 567 return False 568 569 def __str__(self): 570 attrvalues = repr(self.attrvalues)[1:-1] 571 return f"{self.__class__.__qualname__}({self.attrname!r}, {attrvalues})" 572 573 574class attrcontains(Selector): 575 """ 576 Selector that selects all element nodes where an attribute with the specified 577 name contains one of the specified substrings in its value. (Names can be 578 strings, (attribute name, namespace name) tuples or attribute classes or 579 instances). Note that "fancy" attributes (i.e. those containing non-text) 580 will not be considered:: 581 582 >>> from ll.xist import xsc, parse, xfind 583 >>> from ll.xist.ns import xml, html, chars 584 >>> doc = parse.tree( 585 ... parse.URL("https://www.python.org/"), 586 ... parse.Tidy(), 587 ... parse.NS(html), 588 ... parse.Node(pool=xsc.Pool(xml, html, chars)) 589 ... ) 590 >>> for node in doc.walknodes(xfind.attrcontains("rel", "stylesheet")): 591 ... print(node.attrs.rel, node.attrs.href) 592 ... 593 stylesheet https://www.python.org/static/stylesheets/style.css 594 stylesheet https://www.python.org/static/stylesheets/mq.css 595 """ 596 597 def __init__(self, attrname, *attrvalues): 598 self.attrname = attrname 599 if not attrvalues: 600 raise ValueError("need at least one attribute value") 601 self.attrvalues = attrvalues 602 603 def __contains__(self, path): 604 node = path[-1] 605 if isinstance(node, xsc.Element): 606 attr = node.attrs.get(self.attrname) 607 if not attr.isfancy(): # if there are PIs, say no 608 return builtins.any(attrvalue in str(attr) for attrvalue in self.attrvalues) 609 return False 610 611 def __str__(self): 612 attrvalues = repr(self.attrvalues)[1:-1] 613 return f"{self.__class__.__qualname__}({self.attrname!r}, {attrvalues})" 614 615 616class attrstartswith(Selector): 617 """ 618 Selector that selects all element nodes where an attribute with the specified 619 name starts with any of the specified strings. (Names can be strings, 620 (attribute name, namespace name) tuples or attribute classes or instances). 621 Note that "fancy" attributes (i.e. those containing non-text) will not be 622 considered:: 623 624 >>> from ll.xist import xsc, parse, xfind 625 >>> from ll.xist.ns import xml, html, chars 626 >>> doc = parse.tree( 627 ... parse.URL("https://www.python.org/"), 628 ... parse.Tidy(), 629 ... parse.NS(html), 630 ... parse.Node(pool=xsc.Pool(xml, html, chars)) 631 ... ) 632 >>> for node in doc.walknodes(xfind.attrstartswith("class", "icon-")): 633 ... print(node.string()) 634 ... 635 <span aria-hidden="true" class="icon-arrow-down"><span>▼</span></span> 636 <span aria-hidden="true" class="icon-arrow-up"><span>▲</span></span> 637 <span aria-hidden="true" class="icon-search"></span> 638 <span aria-hidden="true" class="icon-google-plus"></span> 639 ... 640 """ 641 642 def __init__(self, attrname, *attrvalues): 643 self.attrname = attrname 644 if not attrvalues: 645 raise ValueError("need at least one attribute value") 646 self.attrvalues = attrvalues 647 648 def __contains__(self, path): 649 node = path[-1] 650 if isinstance(node, xsc.Element): 651 attr = node.attrs.get(self.attrname) 652 if not attr.isfancy(): # if there are PIs, say no 653 return builtins.any(str(attr).startswith(attrvalue) for attrvalue in self.attrvalues) 654 return False 655 656 def __str__(self): 657 attrvalues = repr(self.attrvalues)[1:-1] 658 return f"{self.__class__.__qualname__}({self.attrname!r}, {attrvalues})" 659 660 661class attrendswith(Selector): 662 """ 663 Selector that selects all element nodes where an attribute with the specified 664 name ends with one of the specified strings. (Names can be strings, 665 (attribute name, namespace name) tuples or attribute classes or instances). 666 Note that "fancy" attributes (i.e. those containing non-text) will not be 667 considered:: 668 669 >>> from ll.xist import xsc, parse, xfind 670 >>> from ll.xist.ns import xml, html, chars 671 >>> doc = parse.tree( 672 ... parse.URL("https://www.python.org/"), 673 ... parse.Tidy(), 674 ... parse.NS(html), 675 ... parse.Node(pool=xsc.Pool(xml, html, chars)) 676 ... ) 677 >>> for node in doc.walknodes(xfind.attrendswith("href", ".css")): 678 ... print(node.attrs.href) 679 ... 680 https://www.python.org/static/stylesheets/style.css 681 https://www.python.org/static/stylesheets/mq.css 682 """ 683 684 def __init__(self, attrname, *attrvalues): 685 self.attrname = attrname 686 if not attrvalues: 687 raise ValueError("need at least one attribute value") 688 self.attrvalues = attrvalues 689 690 def __contains__(self, path): 691 node = path[-1] 692 if isinstance(node, xsc.Element): 693 attr = node.attrs.get(self.attrname) 694 if not attr.isfancy(): # if there are PIs, say no 695 return builtins.any(str(attr).endswith(attrvalue) for attrvalue in self.attrvalues) 696 return False 697 698 def __str__(self): 699 attrvalues = repr(self.attrvalues)[1:-1] 700 return f"{self.__class__.__qualname__}({self.attrname!r}, {attrvalues})" 701 702 703class hasid(Selector): 704 """ 705 Selector that selects all element nodes where the ``id`` attribute has one 706 if the specified values:: 707 708 >>> from ll.xist import xsc, parse, xfind 709 >>> from ll.xist.ns import xml, html, chars 710 >>> doc = parse.tree( 711 ... parse.URL("https://www.python.org/"), 712 ... parse.Tidy(), 713 ... parse.NS(html), 714 ... parse.Node(pool=xsc.Pool(xml, html, chars)) 715 ... ) 716 >>> for node in doc.walknodes(xfind.hasid("id-search-field")): 717 ... print(node.string()) 718 ... 719 <input class="search-field" id="id-search-field" name="q" placeholder="Search" role="textbox" tabindex="1" type="search" /> 720 """ 721 722 def __init__(self, *ids): 723 if not ids: 724 raise ValueError("need at least one id") 725 self.ids = ids 726 727 def __contains__(self, path): 728 node = path[-1] 729 if isinstance(node, xsc.Element): 730 attr = node.attrs.get("id") 731 if not attr.isfancy(): 732 return str(attr) in self.ids 733 return False 734 735 def __str__(self): 736 ids = repr(self.ids)[1:-1] 737 return f"{self.__class__.__qualname__}({ids})" 738 739 740class hasclass(Selector): 741 """ 742 Selector that selects all element nodes where the ``class`` attribute contains 743 one of the specified values:: 744 745 >>> from ll.xist import xsc, parse, xfind 746 >>> from ll.xist.ns import xml, html, chars 747 >>> doc = parse.tree( 748 ... parse.URL("https://www.python.org/"), 749 ... parse.Tidy(), 750 ... parse.NS(html), 751 ... parse.Node(pool=xsc.Pool(xml, html, chars)) 752 ... ) 753 >>> for node in doc.walknodes(xfind.hasclass("tier-1")/html.a): 754 ... print(node.string()) 755 ... 756 A A 757 Socialize 758 Sign In 759 About 760 Downloads 761 ... 762 """ 763 764 def __init__(self, *classnames): 765 if not classnames: 766 raise ValueError("need at least one classname") 767 self.classnames = classnames 768 769 def __contains__(self, path): 770 node = path[-1] 771 if isinstance(node, xsc.Element): 772 attr = node.attrs.get("class") 773 if not attr.isfancy(): 774 return builtins.any(classname in str(attr).split() for classname in self.classnames) 775 return False 776 777 def __str__(self): 778 classnames = repr(self.classnames)[1:-1] 779 return f"{self.__class__.__qualname__}({classnames})" 780 781 782class InAttrSelector(Selector): 783 """ 784 Selector that selects all attribute nodes and nodes inside of attributes:: 785 786 >>> from ll.xist import xsc, parse, xfind 787 >>> from ll.xist.ns import xml, html, chars 788 >>> doc = parse.tree( 789 ... parse.URL("https://www.python.org/"), 790 ... parse.Tidy(), 791 ... parse.NS(html), 792 ... parse.Node(pool=xsc.Pool(xml, html, chars)) 793 ... ) 794 >>> for path in doc.walkpaths(xfind.inattr & xsc.Text, enterattrs=True, enterattr=True): 795 ... print(path[-3].xmlname, path[-2].xmlname, path[-1].string()) 796 ... 797 html class no-js 798 html dir ltr 799 html lang en 800 meta charset utf-8 801 meta content IE=edge 802 meta http-equiv X-UA-Compatible 803 ... 804 """ 805 def __contains__(self, path): 806 return builtins.any(isinstance(node, xsc.Attr) for node in path) 807 808 def __str__(self): 809 return "inattr" 810 811 812inattr = InAttrSelector() 813 814 815class Combinator(Selector): 816 """ 817 A :class:`Combinator` is a selector that transforms one or combines two or 818 more other selectors in a certain way. 819 """ 820 821 822class BinaryCombinator(Combinator): 823 """ 824 A :class:`BinaryCombinator` is a combinator that combines two selector: 825 the left hand selector and the right hand selector. 826 """ 827 symbol = None 828 829 def __init__(self, left, right): 830 self.left = left 831 self.right = right 832 833 def __str__(self): 834 left = str(self.left) 835 if isinstance(self.left, Combinator) and not isinstance(self.left, self.__class__): 836 left = f"({left})" 837 right = str(self.right) 838 if isinstance(self.right, Combinator) and not isinstance(self.right, self.__class__): 839 right = f"({right})" 840 return f"{left}{self.symbol}{right}" 841 842 843class ChildCombinator(BinaryCombinator): 844 """ 845 A :class:`ChildCombinator` is a :class:`BinaryCombinator`. To match the 846 :class:`ChildCombinator` the node must match the right hand selector and 847 its immediate parent must match the left hand selector (i.e. it works 848 similar to the ``>`` combinator in CSS or the ``/`` combinator in XPath). 849 850 :class:`ChildCombinator` objects can be created via the division operator 851 (``/``):: 852 853 >>> from ll.xist import xsc, parse 854 >>> from ll.xist.ns import xml, html, chars 855 >>> doc = parse.tree( 856 ... parse.URL("https://www.python.org/"), 857 ... parse.Tidy(), 858 ... parse.NS(html), 859 ... parse.Node(pool=xsc.Pool(xml, html, chars)) 860 ... ) 861 >>> for node in doc.walknodes(html.a/html.img): 862 ... print(node.string()) 863 ... 864 <img alt="python™" class="python-logo" src="https://www.python.org/static/img/python-logo.png" /> 865 """ 866 def __contains__(self, path): 867 if len(path) > 1 and path in self.right: 868 return path[:-1] in self.left 869 return False 870 871 symbol = " / " 872 873 874class DescendantCombinator(BinaryCombinator): 875 """ 876 A :class:`DescendantCombinator` is a :class:`BinaryCombinator`. To match the 877 :class:`DescendantCombinator` the node must match the right hand selector 878 and any of its ancestor nodes must match the left hand selector (i.e. it 879 works similar to the descendant combinator in CSS or the ``//`` combinator 880 in XPath). 881 882 :class:`DescendantCombinator` objects can be created via the floor division 883 operator (``//``):: 884 885 >>> from ll.xist import xsc, parse 886 >>> from ll.xist.ns import xml, html, chars 887 >>> doc = parse.tree( 888 ... parse.URL("https://www.python.org/"), 889 ... parse.Tidy(), 890 ... parse.NS(html), 891 ... parse.Node(pool=xsc.Pool(xml, html, chars)) 892 ... ) 893 >>> for node in doc.walknodes(html.div//html.img): 894 ... print(node.string()) 895 ... 896 <img alt="python™" class="python-logo" src="https://www.python.org/static/img/python-logo.png" /> 897 """ 898 def __contains__(self, path): 899 if path in self.right: 900 while len(path) > 1: 901 path = path[:-1] 902 if path in self.left: 903 return True 904 return False 905 906 symbol = " // " 907 908 909class AdjacentSiblingCombinator(BinaryCombinator): 910 """ 911 A :class:`AdjacentSiblingCombinator` is a :class:`BinaryCombinator`. 912 To match the :class:`AdjacentSiblingCombinator` the node must match the 913 right hand selector and the immediately preceding sibling must match the 914 left hand selector. 915 916 :class:`AdjacentSiblingCombinator` objects can be created via the 917 multiplication operator (``*``). The following example outputs all 918 :class:`span` elements that immediately follow a :class:`form` element:: 919 920 >>> from ll.xist import xsc, parse, xfind 921 >>> from ll.xist.ns import xml, html, chars 922 >>> doc = parse.tree( 923 ... parse.URL("https://www.python.org/"), 924 ... parse.Tidy(), 925 ... parse.NS(html), 926 ... parse.Node(pool=xsc.Pool(xml, html, chars)) 927 ... ) 928 >>> for node in doc.walknodes(html.form*html.span): 929 ... print(node.string()) 930 ... 931 <span class="breaker"></span> 932 """ 933 934 def __contains__(self, path): 935 if len(path) > 1 and path in self.right: 936 # Find sibling 937 node = path[-1] 938 sibling = None 939 for child in path[-2]: 940 if child is node: 941 break 942 sibling = child 943 if sibling is not None: 944 return path[:-1]+[sibling] in self.left 945 return False 946 947 symbol = " * " 948 949 950class GeneralSiblingCombinator(BinaryCombinator): 951 """ 952 A :class:`GeneralSiblingCombinator` is a :class:`BinaryCombinator`. 953 To match the :class:`GeneralSiblingCombinator` the node must match the 954 right hand selector and any of the preceding siblings must match the left 955 hand selector. 956 957 :class:`AdjacentSiblingCombinator` objects can be created via the 958 exponentiation operator (``**``). The following example outputs all 959 :class:`meta` element that come after the :class:`link` elements:: 960 961 >>> from ll.xist import xsc, parse, xfind 962 >>> from ll.xist.ns import xml, html, chars 963 >>> doc = parse.tree( 964 ... parse.URL("https://www.python.org/"), 965 ... parse.Tidy(), 966 ... parse.NS(html), 967 ... parse.Node(pool=xsc.Pool(xml, html, chars)) 968 ... ) 969 >>> for node in doc.walknodes(html.link**html.meta): 970 ... print(node.string()) 971 ... 972 <meta name="application-name" content="Python.org" /> 973 <meta name="msapplication-tooltip" content="The official home of the Python Programming Language" /> 974 <meta name="apple-mobile-web-app-title" content="Python.org" /> 975 <meta name="apple-mobile-web-app-capable" content="yes" /> 976 <meta name="apple-mobile-web-app-status-bar-style" content="black" /> 977 ... 978 """ 979 980 def __contains__(self, path): 981 if len(path) > 1 and path in self.right: 982 node = path[-1] 983 for child in path[-2]: 984 if child is node: # no previous siblings 985 return False 986 if path[:-1]+[child] in self.left: 987 return True 988 return False 989 990 symbol = " ** " 991 992 993class ChainedCombinator(Combinator): 994 """ 995 A :class:`ChainedCombinator` combines any number of other selectors. 996 """ 997 998 symbol = None 999 1000 def __init__(self, *selectors): 1001 self.selectors = tuple(selector(sel) for sel in selectors) 1002 1003 def __str__(self): 1004 v = [] 1005 for sel in self.selectors: 1006 if isinstance(sel, Combinator) and not isinstance(sel, self.__class__): 1007 s = f"({sel})" 1008 else: 1009 s = str(sel) 1010 v.append(s) 1011 return self.symbol.join(v) 1012 1013 1014class OrCombinator(ChainedCombinator): 1015 """ 1016 An :class:`OrCombinator` is a :class:`ChainedCombinator` where the node must 1017 match at least one of the selectors to match the :class:`OrCombinator`. An 1018 :class:`OrCombinator` can be created with the binary or operator (``|``):: 1019 1020 >>> from ll.xist import xsc, parse, xfind 1021 >>> from ll.xist.ns import xml, html, chars 1022 >>> doc = parse.tree( 1023 ... parse.URL("https://www.python.org/"), 1024 ... parse.Tidy(), 1025 ... parse.NS(html), 1026 ... parse.Node(pool=xsc.Pool(xml, html, chars)) 1027 ... ) 1028 >>> for node in doc.walknodes(xfind.hasattr("href") | xfind.hasattr("src")): 1029 ... print(node.attrs.href if "href" in node.Attrs else node.attrs.src) 1030 ... 1031 https://ajax.googleapis.com/ 1032 https://www.python.org/static/js/libs/modernizr.js 1033 https://www.python.org/static/stylesheets/style.css 1034 https://www.python.org/static/stylesheets/mq.css 1035 https://www.python.org/static/favicon.ico 1036 ... 1037 """ 1038 1039 def __contains__(self, path): 1040 return builtins.any(path in sel for sel in self.selectors) 1041 1042 symbol = " | " 1043 1044 def __or__(self, other): 1045 return OrCombinator(*(self.selectors + (selector(other),))) 1046 1047 1048class AndCombinator(ChainedCombinator): 1049 """ 1050 An :class:`AndCombinator` is a :class:`ChainedCombinator` where the node 1051 must match all of the combined selectors to match the :class:`AndCombinator`. 1052 An :class:`AndCombinator` can be created with the binary and operator 1053 (``&``):: 1054 1055 >>> from ll.xist import xsc, parse, xfind 1056 >>> from ll.xist.ns import xml, html, chars 1057 >>> doc = parse.tree( 1058 ... parse.URL("https://www.python.org/"), 1059 ... parse.Tidy(), 1060 ... parse.NS(html), 1061 ... parse.Node(pool=xsc.Pool(xml, html, chars)) 1062 ... ) 1063 >>> for node in doc.walknodes(html.input & xfind.hasattr("id")): 1064 ... print(node.string()) 1065 ... 1066 <input class="search-field" id="id-search-field" name="q" placeholder="Search" role="textbox" tabindex="1" type="search" /> 1067 """ 1068 1069 def __contains__(self, path): 1070 return all(path in sel for sel in self.selectors) 1071 1072 def __and__(self, other): 1073 return AndCombinator(*(self.selectors + (selector(other),))) 1074 1075 symbol = " & " 1076 1077 1078class NotCombinator(Combinator): 1079 """ 1080 A :class:`NotCombinator` inverts the selection logic of the underlying 1081 selector, i.e. a node matches only if it does not match the underlying 1082 selector. A :class:`NotCombinator` can be created with the unary inversion 1083 operator (``~``). 1084 1085 The following example outputs all internal scripts:: 1086 1087 >>> from ll.xist import xsc, parse, xfind 1088 >>> from ll.xist.ns import xml, html, chars 1089 >>> doc = parse.tree( 1090 ... parse.URL("https://www.python.org/"), 1091 ... parse.Tidy(), 1092 ... parse.NS(html), 1093 ... parse.Node(pool=xsc.Pool(xml, html, chars)) 1094 ... ) 1095 >>> for node in doc.walknodes(html.script & ~xfind.hasattr("src")): 1096 ... print(node.string()) 1097 ... 1098 <script type="text/javascript"> 1099 var _gaq = _gaq || []; 1100 _gaq.push(['_setAccount', 'UA-39055973-1']); 1101 _gaq.push(['_trackPageview']); 1102 1103 (function() { 1104 var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true; 1105 ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js'; 1106 var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s); 1107 })(); 1108 </script> 1109 <script>window.jQuery || document.write('<script src="/static/js/libs/jquery-1.8.2.min.js"><\/script>')</script> 1110 """ 1111 1112 def __init__(self, selector): 1113 self.selector = selector 1114 1115 def __contains__(self, path): 1116 return path not in self.selector 1117 1118 def __str__(self): 1119 if isinstance(self.selector, Combinator) and not isinstance(self.selector, NotCombinator): 1120 return f"~({self.selector})" 1121 else: 1122 return f"~{self.selector}" 1123 1124 1125class CallableSelector(Selector): 1126 """ 1127 A :class:`CallableSelector` is a selector that calls a user specified 1128 callable to select nodes. The callable gets passed the path and must return 1129 a bool specifying whether this path is selected. A :class:`CallableSelector` 1130 is created implicitely whenever a callable is passed to a method that 1131 expects a selector. 1132 1133 The following example outputs all links that point outside the ``python.org`` 1134 domain:: 1135 1136 >>> from ll.xist import xsc, parse, xfind 1137 >>> from ll.xist.ns import xml, html, chars 1138 >>> doc = parse.tree( 1139 ... parse.URL("https://www.python.org/"), 1140 ... parse.Tidy(), 1141 ... parse.NS(html), 1142 ... parse.Node(pool=xsc.Pool(xml, html, chars)) 1143 ... ) 1144 >>> def isextlink(path): 1145 ... return isinstance(path[-1], html.a) and not str(path[-1].attrs.href).startswith("https://www.python.org") 1146 ... 1147 >>> for node in doc.walknodes(isextlink): 1148 ... print(node.string()) 1149 ... 1150 <a href="http://docs.python.org/" title="Python Documentation">Docs</a> 1151 <a href="https://pypi.python.org/" title="Python Package Index">PyPI</a> 1152 <a class="text-shrink" href="javascript:;" title="Make Text Smaller">Smaller</a> 1153 <a class="text-grow" href="javascript:;" title="Make Text Larger">Larger</a> 1154 .. 1155 """ 1156 1157 def __init__(self, func): 1158 self.func = func 1159 1160 def __contains__(self, path): 1161 return self.func(path) 1162 1163 def __str__(self): 1164 return f"{self.__class__.__qualname__}({self.func!r})" 1165 1166 1167class nthchild(Selector): 1168 """ 1169 An :class:`nthchild` object is a selector that selects every node that is 1170 the n-th child of its parent. E.g. ``nthchild(0)`` selects every first 1171 child, ``nthchild(-1)`` selects each last child. Furthermore 1172 ``nthchild("even")`` selects each first, third, fifth, ... child and 1173 ``nthchild("odd")`` selects each second, fourth, sixth, ... child. 1174 """ 1175 1176 def __init__(self, index): 1177 self.index = index 1178 1179 def __contains__(self, path): 1180 if len(path) > 1: 1181 if self.index in ("even", "odd"): 1182 for (i, child) in enumerate(path[-2]): 1183 if child is path[-1]: 1184 return (i % 2) == (self.index == "odd") 1185 else: 1186 try: 1187 return path[-2][self.index] is path[-1] 1188 except IndexError: 1189 return False 1190 return False 1191 1192 def __str__(self): 1193 return f"{self.__class__.__qualname__}({self.index!r})" 1194 1195 1196class nthoftype(Selector): 1197 """ 1198 An :class:`nthoftype` object is a selector that selects every node that is 1199 the n-th node of a specified type among its siblings. Similar to 1200 :class:`nthchild` :class:`nthoftype` supports negative and positive indices 1201 as well as ``"even"`` and ``"odd"``. Which types are checked can be passed 1202 explicitly. If no types are passed the type of the node itself is used:: 1203 1204 >>> from ll.xist import xsc, parse, xfind 1205 >>> from ll.xist.ns import xml, html, chars 1206 >>> doc = parse.tree( 1207 ... parse.URL("https://www.python.org/"), 1208 ... parse.Tidy(), 1209 ... parse.NS(html), 1210 ... parse.Node(pool=xsc.Pool(xml, html, chars)) 1211 ... ) 1212 >>> for node in doc.walknodes(xfind.nthoftype(0, html.h2)): 1213 ... print(node.string()) 1214 ... 1215 <h2 class="widget-title"><span aria-hidden="true" class="icon-get-started"></span>Get Started</h2> 1216 <h2 class="widget-title"><span aria-hidden="true" class="icon-download"></span>Download</h2> 1217 <h2 class="widget-title"><span aria-hidden="true" class="icon-documentation"></span>Docs</h2> 1218 <h2 class="widget-title"><span aria-hidden="true" class="icon-jobs"></span>Jobs</h2> 1219 ... 1220 """ 1221 1222 def __init__(self, index, *types): 1223 self.index = index 1224 self.types = types 1225 1226 def _find(self, path): 1227 types = self.types if self.types else path[-1].__class__ 1228 for child in path[-2]: 1229 if isinstance(child, types): 1230 yield child 1231 1232 def __contains__(self, path): 1233 if len(path) > 1: 1234 if self.index in ("even", "odd"): 1235 for (i, child) in enumerate(self._find(path)): 1236 if child is path[-1]: 1237 return (i % 2) == (self.index == "odd") 1238 else: 1239 try: 1240 return misc.item(self._find(path), self.index) is path[-1] 1241 except IndexError: 1242 return False 1243 return False 1244 1245 def __str__(self): 1246 if self.types: 1247 types = ", ".join(f"{type.__module__}.{type.__qualname__}" for type in self.types) 1248 return f"{self.__class__.__qualname__}({self.index!r}, {types})" 1249 else: 1250 return f"{self.__class__.__qualname__}({self.index!r})" 1251