"""
Item Loader

See documentation in docs/topics/loaders.rst
"""
from contextlib import suppress

from itemadapter import ItemAdapter
from parsel.utils import extract_regex, flatten

from itemloaders.common import wrap_loader_context
from itemloaders.processors import Identity
from itemloaders.utils import arg_to_iter


def unbound_method(method):
    """
    Allow to use single-argument functions as input or output processors
    (no need to define an unused first 'self' argument)

    If ``method`` has a ``__qualname__`` without a dot (i.e. it was not
    defined inside a class body) and exposes ``__func__``, the underlying
    function is returned. Any other object is returned unchanged.
    """
    # Objects lacking ``__qualname__`` or ``__func__`` (e.g. plain functions
    # or callable instances) raise AttributeError and fall through untouched.
    with suppress(AttributeError):
        if '.' not in method.__qualname__:
            return method.__func__
    return method


class ItemLoader:
    """
    Return a new Item Loader for populating the given item. If no item is
    given, one is instantiated automatically using the class in
    :attr:`default_item_class`.

    When instantiated with a ``selector`` parameter the :class:`ItemLoader`
    class provides convenient mechanisms for extracting data from web pages
    using parsel_ selectors.

    :param item: The item instance to populate using subsequent calls to
        :meth:`~ItemLoader.add_xpath`, :meth:`~ItemLoader.add_css`,
        or :meth:`~ItemLoader.add_value`.
    :type item: :class:`dict` object

    :param selector: The selector to extract data from, when using the
        :meth:`add_xpath` (resp. :meth:`add_css`) or :meth:`replace_xpath`
        (resp. :meth:`replace_css`) method.
    :type selector: :class:`~parsel.selector.Selector` object

    :param parent: The loader this one is nested in, if any. A loader with a
        parent reads and writes the parent's item and collected values
        instead of its own.
    :type parent: :class:`ItemLoader` object

    The item, selector and the remaining keyword arguments are
    assigned to the Loader context (accessible through the :attr:`context`
    attribute).

    .. attribute:: item

        The item object being parsed by this Item Loader.
        This is mostly used as a property so when attempting to override this
        value, you may want to check out :attr:`default_item_class` first.

    .. attribute:: context

        The currently active :ref:`Context <loaders-context>` of this Item
        Loader. Refer to :ref:`loaders-context` for more information about
        the Loader Context.

    .. attribute:: default_item_class

        An Item class (or factory), used to instantiate items when not given in
        the ``__init__`` method.

        .. warning:: Currently, this factory/class needs to be
            callable/instantiated without any arguments.
            If you are using ``dataclasses``, please consider the following
            alternative::

                from dataclasses import dataclass, field
                from typing import Optional

                @dataclass
                class Product:
                    name: Optional[str] = field(default=None)
                    price: Optional[float] = field(default=None)

    .. attribute:: default_input_processor

        The default input processor to use for those fields which don't specify
        one.

    .. attribute:: default_output_processor

        The default output processor to use for those fields which don't specify
        one.

    .. attribute:: selector

        The :class:`~parsel.selector.Selector` object to extract data from.
        It's the selector given in the ``__init__`` method.
        This attribute is meant to be read-only.

    .. _parsel: https://parsel.readthedocs.io/en/latest/
    """

    default_item_class = dict
    default_input_processor = Identity()
    default_output_processor = Identity()

    def __init__(self, item=None, selector=None, parent=None, **context):
        self.selector = selector
        context.update(selector=selector)
        if item is None:
            item = self.default_item_class()
        self._local_item = item
        context['item'] = item
        self.context = context
        self.parent = parent
        self._local_values = {}
        # Seed the collected values from the initial item. A nested loader
        # shares its parent's values (see ``_values``), so seeding again
        # here would append the same item values a second time.
        if parent is None:
            for field_name, value in ItemAdapter(item).items():
                self._values.setdefault(field_name, [])
                self._values[field_name] += arg_to_iter(value)

    @property
    def _values(self):
        """Collected values: the parent's when nested, otherwise our own."""
        if self.parent is not None:
            return self.parent._values
        else:
            return self._local_values

    @property
    def item(self):
        """Item being populated: the parent's when nested, otherwise our own."""
        if self.parent is not None:
            return self.parent.item
        else:
            return self._local_item

    def nested_xpath(self, xpath, **context):
        """
        Create a nested loader with an xpath selector.
        The supplied selector is applied relative to selector associated
        with this :class:`ItemLoader`. The nested loader shares the item
        with the parent :class:`ItemLoader` so calls to :meth:`add_xpath`,
        :meth:`add_value`, :meth:`replace_value`, etc. will behave as expected.

        :raises RuntimeError: if this loader has no selector
        """
        # Fail with the same explicit error as the other selector-based
        # methods instead of an AttributeError on ``None``.
        self._check_selector_method()
        selector = self.selector.xpath(xpath)
        context.update(selector=selector)
        subloader = self.__class__(
            item=self.item, parent=self, **context
        )
        return subloader

    def nested_css(self, css, **context):
        """
        Create a nested loader with a css selector.
        The supplied selector is applied relative to selector associated
        with this :class:`ItemLoader`. The nested loader shares the item
        with the parent :class:`ItemLoader` so calls to :meth:`add_xpath`,
        :meth:`add_value`, :meth:`replace_value`, etc. will behave as expected.

        :raises RuntimeError: if this loader has no selector
        """
        # Fail with the same explicit error as the other selector-based
        # methods instead of an AttributeError on ``None``.
        self._check_selector_method()
        selector = self.selector.css(css)
        context.update(selector=selector)
        subloader = self.__class__(
            item=self.item, parent=self, **context
        )
        return subloader

    def add_value(self, field_name, value, *processors, **kw):
        """
        Process and then add the given ``value`` for the given field.

        The value is first passed through :meth:`get_value` by giving the
        ``processors`` and ``kwargs``, and then passed through the
        :ref:`field input processor <processors>` and its result
        appended to the data collected for that field. If the field already
        contains collected data, the new data is added.

        The given ``field_name`` can be ``None``, in which case values for
        multiple fields may be added. And the processed value should be a dict
        with field_name mapped to values.

        Examples::

            loader.add_value('name', 'Color TV')
            loader.add_value('colours', ['white', 'blue'])
            loader.add_value('length', '100')
            loader.add_value('name', 'name: foo', TakeFirst(), re='name: (.+)')
            loader.add_value(None, {'name': 'foo', 'sex': 'male'})
        """
        value = self.get_value(value, *processors, **kw)
        if value is None:
            return
        if not field_name:
            for k, v in value.items():
                self._add_value(k, v)
        else:
            self._add_value(field_name, value)

    def replace_value(self, field_name, value, *processors, **kw):
        """
        Similar to :meth:`add_value` but replaces the collected data with the
        new value instead of adding it.
        """
        value = self.get_value(value, *processors, **kw)
        if value is None:
            return
        if not field_name:
            for k, v in value.items():
                self._replace_value(k, v)
        else:
            self._replace_value(field_name, value)

    def _add_value(self, field_name, value):
        """Run ``value`` through the field's input processor and collect it.

        Nothing is stored when the processed result is falsy.
        """
        value = arg_to_iter(value)
        processed_value = self._process_input_value(field_name, value)
        if processed_value:
            self._values.setdefault(field_name, [])
            self._values[field_name] += arg_to_iter(processed_value)

    def _replace_value(self, field_name, value):
        """Discard what was collected for ``field_name``, then add ``value``."""
        self._values.pop(field_name, None)
        self._add_value(field_name, value)

    def get_value(self, value, *processors, **kw):
        """
        Process the given ``value`` by the given ``processors`` and keyword
        arguments.

        Available keyword arguments:

        :param re: a regular expression to use for extracting data from the
            given value using :func:`~parsel.utils.extract_regex` method,
            applied before processors
        :type re: str or typing.Pattern

        :raises ValueError: if any of the ``processors`` raises an exception

        Examples:

        >>> from itemloaders import ItemLoader
        >>> from itemloaders.processors import TakeFirst
        >>> loader = ItemLoader()
        >>> loader.get_value('name: foo', TakeFirst(), str.upper, re='name: (.+)')
        'FOO'
        """
        regex = kw.get('re')
        if regex:
            value = arg_to_iter(value)
            value = flatten(extract_regex(regex, x) for x in value)

        for proc in processors:
            # A processor that yields None ends the chain early.
            if value is None:
                break
            # Keep the unwrapped processor so the error can name it.
            _proc = proc
            proc = wrap_loader_context(proc, self.context)
            try:
                value = proc(value)
            except Exception as e:
                raise ValueError("Error with processor %s value=%r error='%s: %s'" %
                                 (_proc.__class__.__name__, value,
                                  type(e).__name__, str(e))) from e
        return value

    def load_item(self):
        """
        Populate the item with the data collected so far, and return it. The
        data collected is first passed through the :ref:`output processors
        <processors>` to get the final value to assign to each item field.
        """
        adapter = ItemAdapter(self.item)
        # Iterate over a snapshot of the field names collected so far.
        for field_name in tuple(self._values):
            value = self.get_output_value(field_name)
            # Fields whose output processor returns None are left untouched.
            if value is not None:
                adapter[field_name] = value

        return adapter.item

    def get_output_value(self, field_name):
        """
        Return the collected values parsed using the output processor, for the
        given field. This method doesn't populate or modify the item at all.

        :raises ValueError: if the output processor raises an exception
        """
        proc = self.get_output_processor(field_name)
        proc = wrap_loader_context(proc, self.context)
        value = self._values.get(field_name, [])
        try:
            return proc(value)
        except Exception as e:
            raise ValueError("Error with output processor: field=%r value=%r error='%s: %s'" %
                             (field_name, value, type(e).__name__, str(e))) from e

    def get_collected_values(self, field_name):
        """Return the collected values for the given field."""
        return self._values.get(field_name, [])

    def get_input_processor(self, field_name):
        """Return the input processor for ``field_name``.

        Looks for a ``<field_name>_in`` attribute on the loader first, then
        for ``input_processor`` in the item field metadata, and finally falls
        back to :attr:`default_input_processor`.
        """
        proc = getattr(self, '%s_in' % field_name, None)
        if not proc:
            proc = self._get_item_field_attr(
                field_name,
                'input_processor',
                self.default_input_processor
            )
        return unbound_method(proc)

    def get_output_processor(self, field_name):
        """Return the output processor for ``field_name``.

        Looks for a ``<field_name>_out`` attribute on the loader first, then
        for ``output_processor`` in the item field metadata, and finally falls
        back to :attr:`default_output_processor`.
        """
        proc = getattr(self, '%s_out' % field_name, None)
        if not proc:
            proc = self._get_item_field_attr(
                field_name,
                'output_processor',
                self.default_output_processor
            )
        return unbound_method(proc)

    def _get_item_field_attr(self, field_name, key, default=None):
        """Return ``key`` from the item's metadata for ``field_name``, or ``default``."""
        field_meta = ItemAdapter(self.item).get_field_meta(field_name)
        return field_meta.get(key, default)

    def _process_input_value(self, field_name, value):
        """Apply the input processor of ``field_name`` to ``value``.

        :raises ValueError: if the input processor raises an exception
        """
        proc = self.get_input_processor(field_name)
        # Keep the unwrapped processor so the error can name it.
        _proc = proc
        proc = wrap_loader_context(proc, self.context)
        try:
            return proc(value)
        except Exception as e:
            raise ValueError(
                "Error with input processor %s: field=%r value=%r "
                "error='%s: %s'" % (_proc.__class__.__name__, field_name,
                                    value, type(e).__name__, str(e))) from e

    def _check_selector_method(self):
        """Raise :exc:`RuntimeError` if this loader has no selector."""
        if self.selector is None:
            raise RuntimeError(
                "To use XPath or CSS selectors, %s "
                "must be instantiated with a selector" % self.__class__.__name__
            )

    def add_xpath(self, field_name, xpath, *processors, **kw):
        """
        Similar to :meth:`ItemLoader.add_value` but receives an XPath instead of a
        value, which is used to extract a list of strings from the
        selector associated with this :class:`ItemLoader`.

        See :meth:`get_xpath` for ``kwargs``.

        :param xpath: the XPath to extract data from
        :type xpath: str

        Examples::

            # HTML snippet: <p class="product-name">Color TV</p>
            loader.add_xpath('name', '//p[@class="product-name"]')
            # HTML snippet: <p id="price">the price is $1200</p>
            loader.add_xpath('price', '//p[@id="price"]', re='the price is (.*)')

        """
        values = self._get_xpathvalues(xpath, **kw)
        self.add_value(field_name, values, *processors, **kw)

    def replace_xpath(self, field_name, xpath, *processors, **kw):
        """
        Similar to :meth:`add_xpath` but replaces collected data instead of adding it.
        """
        values = self._get_xpathvalues(xpath, **kw)
        self.replace_value(field_name, values, *processors, **kw)

    def get_xpath(self, xpath, *processors, **kw):
        """
        Similar to :meth:`ItemLoader.get_value` but receives an XPath instead of a
        value, which is used to extract a list of unicode strings from the
        selector associated with this :class:`ItemLoader`.

        :param xpath: the XPath to extract data from
        :type xpath: str

        :param re: a regular expression to use for extracting data from the
            selected XPath region
        :type re: str or typing.Pattern

        Examples::

            # HTML snippet: <p class="product-name">Color TV</p>
            loader.get_xpath('//p[@class="product-name"]')
            # HTML snippet: <p id="price">the price is $1200</p>
            loader.get_xpath('//p[@id="price"]', TakeFirst(), re='the price is (.*)')

        """
        values = self._get_xpathvalues(xpath, **kw)
        return self.get_value(values, *processors, **kw)

    def _get_xpathvalues(self, xpaths, **kw):
        """Return the flattened ``getall()`` results of one or more XPaths.

        :raises RuntimeError: if this loader has no selector
        """
        self._check_selector_method()
        xpaths = arg_to_iter(xpaths)
        return flatten(self.selector.xpath(xpath).getall() for xpath in xpaths)

    def add_css(self, field_name, css, *processors, **kw):
        """
        Similar to :meth:`ItemLoader.add_value` but receives a CSS selector
        instead of a value, which is used to extract a list of unicode strings
        from the selector associated with this :class:`ItemLoader`.

        See :meth:`get_css` for ``kwargs``.

        :param css: the CSS selector to extract data from
        :type css: str

        Examples::

            # HTML snippet: <p class="product-name">Color TV</p>
            loader.add_css('name', 'p.product-name')
            # HTML snippet: <p id="price">the price is $1200</p>
            loader.add_css('price', 'p#price', re='the price is (.*)')
        """
        values = self._get_cssvalues(css, **kw)
        self.add_value(field_name, values, *processors, **kw)

    def replace_css(self, field_name, css, *processors, **kw):
        """
        Similar to :meth:`add_css` but replaces collected data instead of adding it.
        """
        values = self._get_cssvalues(css, **kw)
        self.replace_value(field_name, values, *processors, **kw)

    def get_css(self, css, *processors, **kw):
        """
        Similar to :meth:`ItemLoader.get_value` but receives a CSS selector
        instead of a value, which is used to extract a list of unicode strings
        from the selector associated with this :class:`ItemLoader`.

        :param css: the CSS selector to extract data from
        :type css: str

        :param re: a regular expression to use for extracting data from the
            selected CSS region
        :type re: str or typing.Pattern

        Examples::

            # HTML snippet: <p class="product-name">Color TV</p>
            loader.get_css('p.product-name')
            # HTML snippet: <p id="price">the price is $1200</p>
            loader.get_css('p#price', TakeFirst(), re='the price is (.*)')
        """
        values = self._get_cssvalues(css, **kw)
        return self.get_value(values, *processors, **kw)

    def _get_cssvalues(self, csss, **kw):
        """Return the flattened ``getall()`` results of one or more CSS selectors.

        :raises RuntimeError: if this loader has no selector
        """
        self._check_selector_method()
        csss = arg_to_iter(csss)
        return flatten(self.selector.css(css).getall() for css in csss)