"""
XPath selectors based on lxml
"""

from parsel import Selector as _ParselSelector
from scrapy.utils.trackref import object_ref
from scrapy.utils.python import to_bytes
from scrapy.http import HtmlResponse, XmlResponse


__all__ = ['Selector', 'SelectorList']


def _st(response, st):
    """Resolve the effective selector type.

    An explicitly given ``st`` is returned untouched.  When ``st`` is
    ``None`` the type is derived from ``response``: ``'xml'`` for an
    :class:`~scrapy.http.XmlResponse`, ``'html'`` for everything else
    (including ``response=None``).
    """
    if st is not None:
        return st
    if isinstance(response, XmlResponse):
        return 'xml'
    return 'html'


def _response_from_text(text, st):
    """Wrap ``text`` in a synthetic response matching selector type ``st``.

    The response class is :class:`~scrapy.http.XmlResponse` when ``st`` is
    ``'xml'`` and :class:`~scrapy.http.HtmlResponse` otherwise.  The body is
    ``text`` converted with ``to_bytes(text, 'utf-8')`` and the URL is the
    placeholder ``about:blank``.
    """
    response_cls = XmlResponse if st == 'xml' else HtmlResponse
    body = to_bytes(text, 'utf-8')
    return response_cls(url='about:blank', encoding='utf-8', body=body)


class SelectorList(_ParselSelector.selectorlist_cls, object_ref):
    """
    List of :class:`Selector` results.

    The :class:`SelectorList` class is a subclass of the builtin ``list``
    class, which provides a few additional methods.  It extends parsel's
    selector list class and mixes in ``object_ref``.
    """


class Selector(_ParselSelector, object_ref):
    """
    An instance of :class:`Selector` is a wrapper over response to select
    certain parts of its content.

    ``response`` is an :class:`~scrapy.http.HtmlResponse` or an
    :class:`~scrapy.http.XmlResponse` object that will be used for selecting
    and extracting data.

    ``text`` is a unicode string or utf-8 encoded text for cases when a
    ``response`` isn't available.  Passing both ``text`` and ``response``
    raises :exc:`ValueError`.

    ``type`` defines the selector type, it can be ``"html"``, ``"xml"``
    or ``None`` (default).

    If ``type`` is ``None``, the selector type is picked automatically:

    * ``"html"`` for :class:`~scrapy.http.HtmlResponse` type
    * ``"xml"`` for :class:`~scrapy.http.XmlResponse` type
    * ``"html"`` for anything else, including the case where only ``text``
      is given

    Otherwise, if ``type`` is set, the selector type will be forced and no
    detection will occur.

    ``root`` and any extra keyword arguments are forwarded unchanged to the
    underlying parsel selector.
    """

    __slots__ = ['response']
    selectorlist_cls = SelectorList

    def __init__(self, response=None, text=None, type=None, root=None, **kwargs):
        # ``response`` and ``text`` are mutually exclusive inputs.
        if not (response is None or text is None):
            raise ValueError(f'{self.__class__.__name__}.__init__() received '
                             'both response and text')

        # Resolved before ``response`` is possibly replaced below, so a
        # text-only selector with no explicit type ends up as 'html'.
        selector_type = _st(response, type)

        if text is not None:
            # Build a stand-in response so both input paths share the code
            # that follows.
            response = _response_from_text(text, selector_type)

        if response is not None:
            text = response.text
            # Only fills in ``base_url`` when the caller did not supply one.
            kwargs.setdefault('base_url', response.url)

        self.response = response
        super().__init__(text=text, type=selector_type, root=root, **kwargs)