1"""
2Item Loader
3
4See documentation in docs/topics/loaders.rst
5"""
6from contextlib import suppress
7
8from itemadapter import ItemAdapter
9from parsel.utils import extract_regex, flatten
10
11from itemloaders.common import wrap_loader_context
12from itemloaders.processors import Identity
13from itemloaders.utils import arg_to_iter
14
15
def unbound_method(method):
    """
    Return the plain function behind ``method`` when it wraps a function
    whose qualified name contains no dot; otherwise return ``method`` as is.

    This lets single-argument functions be used as input or output
    processors without declaring an unused leading ``self`` argument.
    Objects lacking ``__qualname__`` or ``__func__`` are returned unchanged.
    """
    try:
        qualified_name = method.__qualname__
        if '.' in qualified_name:
            # Dotted qualified name: leave the object untouched.
            return method
        return method.__func__
    except AttributeError:
        # No ``__qualname__`` or no ``__func__``: nothing to unwrap.
        return method
25
26
class ItemLoader:
    """
    Return a new Item Loader for populating the given item. If no item is
    given, one is instantiated automatically using the class in
    :attr:`default_item_class`.

    When instantiated with a ``selector`` parameter the :class:`ItemLoader`
    class provides convenient mechanisms for extracting data from web pages
    using parsel_ selectors.

    :param item: The item instance to populate using subsequent calls to
        :meth:`~ItemLoader.add_xpath`, :meth:`~ItemLoader.add_css`,
        or :meth:`~ItemLoader.add_value`.
    :type item: :class:`dict` object

    :param selector: The selector to extract data from, when using the
        :meth:`add_xpath` (resp. :meth:`add_css`) or :meth:`replace_xpath`
        (resp. :meth:`replace_css`) method.
    :type selector: :class:`~parsel.selector.Selector` object

    :param parent: The loader this one is nested in, if any. A nested loader
        reads and writes the item and the collected values of its parent.
    :type parent: :class:`ItemLoader` object

    The item, selector and the remaining keyword arguments are
    assigned to the Loader context (accessible through the :attr:`context`
    attribute).

    .. attribute:: item

        The item object being parsed by this Item Loader.
        This is mostly used as a property so when attempting to override this
        value, you may want to check out :attr:`default_item_class` first.

    .. attribute:: context

        The currently active :ref:`Context <loaders-context>` of this Item
        Loader. Refer to :ref:`Loader Context <loaders-context>` for more
        information.

    .. attribute:: default_item_class

        An Item class (or factory), used to instantiate items when not given in
        the ``__init__`` method.

        .. warning:: Currently, this factory/class needs to be
            callable/instantiated without any arguments.
            If you are using ``dataclasses``, please consider the following
            alternative::

                from dataclasses import dataclass, field
                from typing import Optional

                @dataclass
                class Product:
                    name: Optional[str] = field(default=None)
                    price: Optional[float] = field(default=None)

    .. attribute:: default_input_processor

        The default input processor to use for those fields which don't specify
        one.

    .. attribute:: default_output_processor

        The default output processor to use for those fields which don't specify
        one.

    .. attribute:: selector

        The :class:`~parsel.selector.Selector` object to extract data from.
        It's the selector given in the ``__init__`` method.
        This attribute is meant to be read-only.

    .. _parsel: https://parsel.readthedocs.io/en/latest/
    """

    default_item_class = dict
    default_input_processor = Identity()
    default_output_processor = Identity()

    def __init__(self, item=None, selector=None, parent=None, **context):
        self.selector = selector
        context.update(selector=selector)
        if item is None:
            item = self.default_item_class()
        self._local_item = item
        context['item'] = item
        self.context = context
        self.parent = parent
        self._local_values = {}
        # Seed the collected values from the initial item, but only for a
        # root loader. A nested loader resolves ``_values`` to its parent's
        # store, which was already seeded from the same item; seeding again
        # would append every initial value a second time.
        if parent is None:
            for field_name, value in ItemAdapter(item).items():
                self._values.setdefault(field_name, [])
                self._values[field_name] += arg_to_iter(value)

    @property
    def _values(self):
        """Collected values per field; delegated to the parent when nested."""
        if self.parent is not None:
            return self.parent._values
        return self._local_values

    @property
    def item(self):
        """The item being populated; delegated to the parent when nested."""
        if self.parent is not None:
            return self.parent.item
        return self._local_item

    def nested_xpath(self, xpath, **context):
        """
        Create a nested loader with an xpath selector.
        The supplied selector is applied relative to selector associated
        with this :class:`ItemLoader`. The nested loader shares the item
        with the parent :class:`ItemLoader` so calls to :meth:`add_xpath`,
        :meth:`add_value`, :meth:`replace_value`, etc. will behave as expected.

        :raises RuntimeError: if this loader has no selector.
        """
        # Fail with the same explicit error as the other selector-based
        # methods instead of an AttributeError on ``None``.
        self._check_selector_method()
        selector = self.selector.xpath(xpath)
        context.update(selector=selector)
        return self.__class__(item=self.item, parent=self, **context)

    def nested_css(self, css, **context):
        """
        Create a nested loader with a css selector.
        The supplied selector is applied relative to selector associated
        with this :class:`ItemLoader`. The nested loader shares the item
        with the parent :class:`ItemLoader` so calls to :meth:`add_xpath`,
        :meth:`add_value`, :meth:`replace_value`, etc. will behave as expected.

        :raises RuntimeError: if this loader has no selector.
        """
        # Fail with the same explicit error as the other selector-based
        # methods instead of an AttributeError on ``None``.
        self._check_selector_method()
        selector = self.selector.css(css)
        context.update(selector=selector)
        return self.__class__(item=self.item, parent=self, **context)

    def add_value(self, field_name, value, *processors, **kw):
        """
        Process and then add the given ``value`` for the given field.

        The value is first passed through :meth:`get_value` by giving the
        ``processors`` and ``kwargs``, and then passed through the
        :ref:`field input processor <processors>` and its result
        appended to the data collected for that field. If the field already
        contains collected data, the new data is added.

        The given ``field_name`` can be ``None``, in which case values for
        multiple fields may be added. And the processed value should be a dict
        with field_name mapped to values.

        Nothing is added when :meth:`get_value` returns ``None``.

        Examples::

            loader.add_value('name', 'Color TV')
            loader.add_value('colours', ['white', 'blue'])
            loader.add_value('length', '100')
            loader.add_value('name', 'name: foo', TakeFirst(), re='name: (.+)')
            loader.add_value(None, {'name': 'foo', 'sex': 'male'})
        """
        value = self.get_value(value, *processors, **kw)
        if value is None:
            return
        if not field_name:
            # No field name: ``value`` maps field names to their values.
            for k, v in value.items():
                self._add_value(k, v)
        else:
            self._add_value(field_name, value)

    def replace_value(self, field_name, value, *processors, **kw):
        """
        Similar to :meth:`add_value` but replaces the collected data with the
        new value instead of adding it.
        """
        value = self.get_value(value, *processors, **kw)
        if value is None:
            return
        if not field_name:
            # No field name: ``value`` maps field names to their values.
            for k, v in value.items():
                self._replace_value(k, v)
        else:
            self._replace_value(field_name, value)

    def _add_value(self, field_name, value):
        """Run ``value`` through the field's input processor and, when the
        result is truthy, append it to the values collected for the field."""
        value = arg_to_iter(value)
        processed_value = self._process_input_value(field_name, value)
        if processed_value:
            self._values.setdefault(field_name, [])
            self._values[field_name] += arg_to_iter(processed_value)

    def _replace_value(self, field_name, value):
        """Drop the values collected for the field, then add ``value``."""
        self._values.pop(field_name, None)
        self._add_value(field_name, value)

    def get_value(self, value, *processors, **kw):
        """
        Process the given ``value`` by the given ``processors`` and keyword
        arguments.

        Available keyword arguments:

        :param re: a regular expression to use for extracting data from the
            given value using :func:`~parsel.utils.extract_regex` method,
            applied before processors
        :type re: str or typing.Pattern

        :raises ValueError: if one of the processors raises an exception;
            the original exception is attached as the cause.

        Examples:

        >>> from itemloaders import ItemLoader
        >>> from itemloaders.processors import TakeFirst
        >>> loader = ItemLoader()
        >>> loader.get_value('name: foo', TakeFirst(), str.upper, re='name: (.+)')
        'FOO'
        """
        regex = kw.get('re')
        if regex:
            value = arg_to_iter(value)
            value = flatten(extract_regex(regex, x) for x in value)

        for proc in processors:
            # A processor returning None ends the chain early.
            if value is None:
                break
            # Keep the unwrapped processor for the error message.
            _proc = proc
            proc = wrap_loader_context(proc, self.context)
            try:
                value = proc(value)
            except Exception as e:
                raise ValueError("Error with processor %s value=%r error='%s: %s'" %
                                 (_proc.__class__.__name__, value,
                                  type(e).__name__, str(e))) from e
        return value

    def load_item(self):
        """
        Populate the item with the data collected so far, and return it. The
        data collected is first passed through the :ref:`output processors
        <processors>` to get the final value to assign to each item field.

        Fields whose output value is ``None`` are not assigned.
        """
        adapter = ItemAdapter(self.item)
        # Iterate over a snapshot of the field names.
        for field_name in tuple(self._values):
            value = self.get_output_value(field_name)
            if value is not None:
                adapter[field_name] = value

        return adapter.item

    def get_output_value(self, field_name):
        """
        Return the collected values parsed using the output processor, for the
        given field. This method doesn't populate or modify the item at all.

        :raises ValueError: if the output processor raises an exception;
            the original exception is attached as the cause.
        """
        proc = self.get_output_processor(field_name)
        proc = wrap_loader_context(proc, self.context)
        value = self._values.get(field_name, [])
        try:
            return proc(value)
        except Exception as e:
            raise ValueError("Error with output processor: field=%r value=%r error='%s: %s'" %
                             (field_name, value, type(e).__name__, str(e))) from e

    def get_collected_values(self, field_name):
        """Return the collected values for the given field."""
        return self._values.get(field_name, [])

    def get_input_processor(self, field_name):
        """
        Return the input processor for the given field.

        Lookup order: a ``<field_name>_in`` attribute of this loader, then the
        ``input_processor`` key of the item field metadata, then
        :attr:`default_input_processor`.
        """
        proc = getattr(self, '%s_in' % field_name, None)
        if not proc:
            proc = self._get_item_field_attr(
                field_name,
                'input_processor',
                self.default_input_processor
            )
        return unbound_method(proc)

    def get_output_processor(self, field_name):
        """
        Return the output processor for the given field.

        Lookup order: a ``<field_name>_out`` attribute of this loader, then
        the ``output_processor`` key of the item field metadata, then
        :attr:`default_output_processor`.
        """
        proc = getattr(self, '%s_out' % field_name, None)
        if not proc:
            proc = self._get_item_field_attr(
                field_name,
                'output_processor',
                self.default_output_processor
            )
        return unbound_method(proc)

    def _get_item_field_attr(self, field_name, key, default=None):
        """Return ``key`` from the item's metadata for ``field_name``, or
        ``default`` when the metadata does not define it."""
        field_meta = ItemAdapter(self.item).get_field_meta(field_name)
        return field_meta.get(key, default)

    def _process_input_value(self, field_name, value):
        """
        Run ``value`` through the input processor of ``field_name``.

        :raises ValueError: if the input processor raises an exception;
            the original exception is attached as the cause.
        """
        proc = self.get_input_processor(field_name)
        # Keep the unwrapped processor for the error message.
        _proc = proc
        proc = wrap_loader_context(proc, self.context)
        try:
            return proc(value)
        except Exception as e:
            raise ValueError(
                "Error with input processor %s: field=%r value=%r "
                "error='%s: %s'" % (_proc.__class__.__name__, field_name,
                                    value, type(e).__name__, str(e))) from e

    def _check_selector_method(self):
        """Raise :exc:`RuntimeError` if this loader has no selector."""
        if self.selector is None:
            raise RuntimeError(
                "To use XPath or CSS selectors, %s "
                "must be instantiated with a selector" % self.__class__.__name__
            )

    def add_xpath(self, field_name, xpath, *processors, **kw):
        """
        Similar to :meth:`ItemLoader.add_value` but receives an XPath instead of a
        value, which is used to extract a list of strings from the
        selector associated with this :class:`ItemLoader`.

        See :meth:`get_xpath` for ``kwargs``.

        :param xpath: the XPath to extract data from
        :type xpath: str

        Examples::

            # HTML snippet: <p class="product-name">Color TV</p>
            loader.add_xpath('name', '//p[@class="product-name"]')
            # HTML snippet: <p id="price">the price is $1200</p>
            loader.add_xpath('price', '//p[@id="price"]', re='the price is (.*)')

        """
        values = self._get_xpathvalues(xpath, **kw)
        self.add_value(field_name, values, *processors, **kw)

    def replace_xpath(self, field_name, xpath, *processors, **kw):
        """
        Similar to :meth:`add_xpath` but replaces collected data instead of adding it.
        """
        values = self._get_xpathvalues(xpath, **kw)
        self.replace_value(field_name, values, *processors, **kw)

    def get_xpath(self, xpath, *processors, **kw):
        """
        Similar to :meth:`ItemLoader.get_value` but receives an XPath instead of a
        value, which is used to extract a list of unicode strings from the
        selector associated with this :class:`ItemLoader`.

        :param xpath: the XPath to extract data from
        :type xpath: str

        :param re: a regular expression to use for extracting data from the
            selected XPath region
        :type re: str or typing.Pattern

        Examples::

            # HTML snippet: <p class="product-name">Color TV</p>
            loader.get_xpath('//p[@class="product-name"]')
            # HTML snippet: <p id="price">the price is $1200</p>
            loader.get_xpath('//p[@id="price"]', TakeFirst(), re='the price is (.*)')

        """
        values = self._get_xpathvalues(xpath, **kw)
        return self.get_value(values, *processors, **kw)

    def _get_xpathvalues(self, xpaths, **kw):
        """Return the flattened ``getall()`` results of every given XPath.

        ``kw`` is accepted so callers can forward their keyword arguments
        unchanged; it is not used here.
        """
        self._check_selector_method()
        xpaths = arg_to_iter(xpaths)
        return flatten(self.selector.xpath(xpath).getall() for xpath in xpaths)

    def add_css(self, field_name, css, *processors, **kw):
        """
        Similar to :meth:`ItemLoader.add_value` but receives a CSS selector
        instead of a value, which is used to extract a list of unicode strings
        from the selector associated with this :class:`ItemLoader`.

        See :meth:`get_css` for ``kwargs``.

        :param css: the CSS selector to extract data from
        :type css: str

        Examples::

            # HTML snippet: <p class="product-name">Color TV</p>
            loader.add_css('name', 'p.product-name')
            # HTML snippet: <p id="price">the price is $1200</p>
            loader.add_css('price', 'p#price', re='the price is (.*)')
        """
        values = self._get_cssvalues(css, **kw)
        self.add_value(field_name, values, *processors, **kw)

    def replace_css(self, field_name, css, *processors, **kw):
        """
        Similar to :meth:`add_css` but replaces collected data instead of adding it.
        """
        values = self._get_cssvalues(css, **kw)
        self.replace_value(field_name, values, *processors, **kw)

    def get_css(self, css, *processors, **kw):
        """
        Similar to :meth:`ItemLoader.get_value` but receives a CSS selector
        instead of a value, which is used to extract a list of unicode strings
        from the selector associated with this :class:`ItemLoader`.

        :param css: the CSS selector to extract data from
        :type css: str

        :param re: a regular expression to use for extracting data from the
            selected CSS region
        :type re: str or typing.Pattern

        Examples::

            # HTML snippet: <p class="product-name">Color TV</p>
            loader.get_css('p.product-name')
            # HTML snippet: <p id="price">the price is $1200</p>
            loader.get_css('p#price', TakeFirst(), re='the price is (.*)')
        """
        values = self._get_cssvalues(css, **kw)
        return self.get_value(values, *processors, **kw)

    def _get_cssvalues(self, csss, **kw):
        """Return the flattened ``getall()`` results of every given CSS
        selector.

        ``kw`` is accepted so callers can forward their keyword arguments
        unchanged; it is not used here.
        """
        self._check_selector_method()
        csss = arg_to_iter(csss)
        return flatten(self.selector.css(css).getall() for css in csss)
443