Source code for zyte_spider_templates.spiders.ecommerce

from __future__ import annotations

from enum import Enum
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    Dict,
    Iterable,
    List,
    Optional,
    TypeVar,
    Union,
    cast,
)

import scrapy
from pydantic import BaseModel, ConfigDict, Field, model_validator
from scrapy.crawler import Crawler
from scrapy_poet import DummyResponse, DynamicDeps
from scrapy_spider_metadata import Args
from web_poet.page_inputs.browser import BrowserResponse
from zyte_common_items import (
    CustomAttributes,
    Item,
    ProbabilityRequest,
    Product,
    ProductList,
    ProductNavigation,
    SearchRequestTemplate,
)

from zyte_spider_templates.heuristics import is_homepage
from zyte_spider_templates.params import ExtractFrom, parse_input_params
from zyte_spider_templates.spiders.base import (
    ARG_SETTING_PRIORITY,
    INPUT_GROUP,
    BaseSpider,
)
from zyte_spider_templates.utils import get_domain

from ..documentation import document_enum
from ..params import (
    CustomAttrsInputParam,
    CustomAttrsMethodParam,
    ExtractFromParam,
    GeolocationParam,
    MaxRequestsParam,
    SearchQueriesParam,
    UrlParam,
    UrlsFileParam,
    UrlsParam,
)

if TYPE_CHECKING:
    # typing.Self requires Python 3.11
    from typing_extensions import Self


ItemTV = TypeVar("ItemTV", bound=Item)


# String-valued enumeration of the crawl strategies accepted by the
# ``crawl_strategy`` spider parameter. Each member is followed by its own
# docstring; presumably the ``document_enum`` decorator picks those up for the
# generated documentation (confirm against ..documentation).
@document_enum
class EcommerceCrawlStrategy(str, Enum):
    automatic: str = "automatic"
    """
    Automatically select the best approach. A good default for most use cases.
    Currently it uses heuristics only on the homepages of websites (similar to
    Full strategy), and follows product, category and pagination links on other
    pages (similar to Navigation strategy).
    """

    full: str = "full"
    """
    Follow most links on the website to discover and extract as many products
    as possible. If an input URL is a link to a particular category on a
    website, the spider may crawl products outside this category. Try this
    strategy if other strategies miss items.
    """

    navigation: str = "navigation"
    """
    Follow pagination, subcategories, and product links only. If an input URL
    is a link to a particular category on a website, the spider will try to
    stay within this category.
    """

    pagination_only: str = "pagination_only"
    """
    Follow pagination and product links only. This strategy is similar to
    Navigation, but it doesn't support subcategories. Use it when you need the
    spider to stay within a certain category on a website, but Automatic or
    Navigation strategies fail to do so because of misclassified subcategory
    links.
    """

    direct_item: str = "direct_item"
    """
    Directly extract items from the provided URLs, without any crawling. To use
    this strategy, pass to the spider individual product or product list URLs
    (in line with the extract spider parameter value). Common use cases are
    product monitoring and batch extraction.
    """
# Parameter mixin that contributes the ``crawl_strategy`` field. The
# ``enumMeta`` entry of ``json_schema_extra`` maps every strategy to a title
# and description that end up in the field's JSON schema.
# NOTE: documented with comments rather than a class docstring on purpose, so
# that nothing new is added to the model's ``__doc__``.
class EcommerceCrawlStrategyParam(BaseModel):
    crawl_strategy: EcommerceCrawlStrategy = Field(
        title="Crawl strategy",
        description="Determines how the start URL and follow-up URLs are crawled.",
        default=EcommerceCrawlStrategy.automatic,
        json_schema_extra={
            "enumMeta": {
                EcommerceCrawlStrategy.automatic: {
                    "description": (
                        "Automatically select the best approach. A good "
                        "default for most use cases. Currently it uses "
                        "heuristics only on the homepages of websites (similar "
                        "to Full strategy), and follows product, category and "
                        "pagination links on other pages (similar to Navigation "
                        "strategy)."
                    ),
                    "title": "Automatic",
                },
                EcommerceCrawlStrategy.full: {
                    "title": "Full",
                    "description": (
                        "Follow most links on the website to discover and "
                        "extract as many products as possible. If an input URL "
                        "is a link to a particular category on a website, the "
                        "spider may crawl products outside this category. Try "
                        "this strategy if other strategies miss items."
                    ),
                },
                EcommerceCrawlStrategy.navigation: {
                    "title": "Navigation",
                    "description": (
                        "Follow pagination, subcategories, and product links "
                        "only. If an input URL is a link to a particular "
                        "category on a website, the spider will try to stay "
                        "within this category."
                    ),
                },
                EcommerceCrawlStrategy.pagination_only: {
                    "title": "Pagination Only",
                    "description": (
                        "Follow pagination and product links only. This "
                        "strategy is similar to Navigation, but it doesn't "
                        "support subcategories. Use it when you need the "
                        "spider to stay within a certain category on a "
                        "website, but Automatic or Navigation strategies fail "
                        "to do so because of misclassified subcategory links."
                    ),
                },
                EcommerceCrawlStrategy.direct_item: {
                    "title": "Direct URLs",
                    "description": (
                        "Directly extract items from the provided URLs, "
                        "without any crawling. To use this strategy, pass to "
                        "the spider individual product or product list URLs "
                        "(in line with the extract spider parameter value). "
                        "Common use cases are product monitoring and batch "
                        "extraction."
                    ),
                },
            },
        },
    )
# String-valued enumeration of the kinds of data the spider can return, as
# selected through the ``extract`` spider parameter. Each member is followed
# by its own docstring.
@document_enum
class EcommerceExtract(str, Enum):
    product: str = "product"
    """
    Product data from product detail pages.
    """

    productList: str = "productList"
    """
    Product list data from product listing pages (e.g. category pages).
    """
# Parameter mixin that contributes the ``extract`` field; it defaults to
# extracting individual products.
class EcommerceExtractParam(BaseModel):
    extract: EcommerceExtract = Field(
        title="Extract",
        description="Data to return.",
        default=EcommerceExtract.product,
    )
# E-commerce flavour of ``SearchQueriesParam``: redeclares ``search_queries``
# with a description that spells out which crawl strategies it cannot be
# combined with. The default is an empty list (no search).
class EcommerceSearchQueriesParam(SearchQueriesParam):
    search_queries: List[str] = Field(
        title="Search Queries",
        description=(
            "A list of search queries, one per line, to submit using the "
            "search form found on each input URL. Only works for input URLs "
            "that support search. May not work on every website. Search "
            'queries are not compatible with the "full" and "navigation" '
            "crawl strategies, and when extracting products, they are not "
            'compatible with the "direct_item" crawl strategy either.'
        ),
        default_factory=list,
        json_schema_extra={
            "default": [],
            "widget": "textarea",
        },
    )
# Full parameter set of the e-commerce spider, assembled from the individual
# parameter mixins listed as base classes.
# NOTE: documented with comments rather than a class docstring on purpose, so
# that nothing new is added to the model's ``__doc__``.
class EcommerceSpiderParams(
    CustomAttrsMethodParam,
    CustomAttrsInputParam,
    ExtractFromParam,
    MaxRequestsParam,
    GeolocationParam,
    EcommerceCrawlStrategyParam,
    EcommerceExtractParam,
    EcommerceSearchQueriesParam,
    UrlsFileParam,
    UrlsParam,
    UrlParam,
    BaseModel,
):
    model_config = ConfigDict(
        json_schema_extra={
            "groups": [
                INPUT_GROUP,
            ],
        },
    )

    @model_validator(mode="after")
    def validate_search_queries_and_strategy(self):
        # Reject parameter combinations in which search queries cannot work.
        # Raises ValueError when search queries are combined with the "full"
        # or "navigation" strategy, or with "direct_item" while extracting
        # anything other than product lists. Returns the model unchanged
        # otherwise.

        # Without search queries every strategy/extract combination is fine.
        if not self.search_queries:
            return self

        if self.crawl_strategy in {
            EcommerceCrawlStrategy.full,
            EcommerceCrawlStrategy.navigation,
        }:
            raise ValueError(
                f"Cannot combine the {self.crawl_strategy.value!r} value of "
                f"the crawl_strategy spider parameter with the search_queries "
                f"spider parameter."
            )

        searching_direct_items = (
            self.crawl_strategy == EcommerceCrawlStrategy.direct_item
        )
        if searching_direct_items and self.extract != EcommerceExtract.productList:
            raise ValueError(
                f"Cannot combine the {self.crawl_strategy.value!r} value of "
                f"the crawl_strategy spider parameter with the search_queries "
                f"spider parameter unless the extract spider parameter is "
                f"{EcommerceExtract.productList.value!r}."
            )

        return self
class EcommerceSpider(Args[EcommerceSpiderParams], BaseSpider):
    """Yield products from an e-commerce website.

    See :class:`~zyte_spider_templates.spiders.ecommerce.EcommerceSpiderParams`
    for supported parameters.

    .. seealso:: :ref:`e-commerce`.
    """

    name = "ecommerce"

    metadata: Dict[str, Any] = {
        **BaseSpider.metadata,
        "title": "E-commerce",
        "description": "Template for spiders that extract product data from e-commerce websites.",
    }

    @classmethod
    def from_crawler(cls, crawler: Crawler, *args, **kwargs) -> Self:
        """Create the spider, then process its input parameters and apply the
        ``extract_from`` argument to the settings."""
        spider = super().from_crawler(crawler, *args, **kwargs)
        parse_input_params(spider)
        spider._init_extract_from()
        return spider

    def _init_extract_from(self) -> None:
        """Translate the ``extract_from`` argument into the
        ``ZYTE_API_PROVIDER_PARAMS`` setting.

        Does nothing when ``extract_from`` is ``None``. Entries already present
        in ``ZYTE_API_PROVIDER_PARAMS`` are unpacked last, so they win over the
        values derived from the argument.
        """
        if self.args.extract_from is None:
            return
        self.settings.set(
            "ZYTE_API_PROVIDER_PARAMS",
            {
                "productOptions": {"extractFrom": self.args.extract_from},
                "productNavigationOptions": {"extractFrom": self.args.extract_from},
                "productListOptions": {"extractFrom": self.args.extract_from},
                **self.settings.get("ZYTE_API_PROVIDER_PARAMS", {}),
            },
            priority=ARG_SETTING_PRIORITY,
        )

    def get_start_request(self, url: str) -> scrapy.Request:
        """Build the initial request for an input URL.

        The callback is :meth:`parse_product` only when the crawl strategy is
        ``direct_item`` and products are being extracted; in every other case
        it is :meth:`parse_navigation`.
        """
        callback = (
            self.parse_product
            if self.args.crawl_strategy == EcommerceCrawlStrategy.direct_item
            and self.args.extract == EcommerceExtract.product
            else self.parse_navigation
        )
        meta: Dict[str, Any] = {
            "crawling_logs": {
                "page_type": self.args.extract.value
                if self.args.crawl_strategy == EcommerceCrawlStrategy.direct_item
                else "productNavigation"
            },
        }

        # Custom attributes are requested only where an item is produced
        # straight from the start URL.
        if (
            self.args.crawl_strategy == EcommerceCrawlStrategy.direct_item
            or self.args.extract == EcommerceExtract.productList
        ) and self._custom_attrs_dep:
            meta.setdefault("inject", []).append(self._custom_attrs_dep)
        if self.args.extract == EcommerceExtract.productList:
            meta.setdefault("inject", []).append(ProductList)

        if self.args.crawl_strategy == EcommerceCrawlStrategy.full:
            meta["page_params"] = {"full_domain": get_domain(url)}
        elif self.args.crawl_strategy == EcommerceCrawlStrategy.automatic:
            if is_homepage(url):
                meta["page_params"] = {"full_domain": get_domain(url)}
                self.logger.info(
                    f"[Automatic Strategy] The input URL {url} seems to be a homepage. "
                    f"Heuristics will be used on it to crawl other pages which might have products."
                )
            else:
                self.logger.info(
                    f"[Automatic Strategy] The input URL {url} doesn't seem to be a homepage. "
                    f"Heuristics won't be used to crawl other pages which might have products."
                )

        return scrapy.Request(
            url=url,
            callback=callback,
            meta=meta,
        )

    def start_requests(self) -> Iterable[scrapy.Request]:
        """Yield the initial requests.

        With search queries, every start URL is first requested to obtain a
        search request template; otherwise :meth:`get_start_request` is used.
        """
        if self.args.search_queries:
            for url in self.start_urls:
                meta: Dict[str, Any] = {
                    "crawling_logs": {"page_type": "searchRequestTemplate"},
                }
                if self.args.extract_from == ExtractFrom.browserHtml:
                    meta["inject"] = [BrowserResponse]
                with self._log_request_exception:
                    yield scrapy.Request(
                        url=url,
                        callback=self.parse_search_request_template,
                        meta=meta,
                    )
        else:
            for url in self.start_urls:
                with self._log_request_exception:
                    yield self.get_start_request(url)

    def parse_search_request_template(
        self,
        response: DummyResponse,
        search_request_template: SearchRequestTemplate,
        dynamic: DynamicDeps,
    ) -> Iterable[scrapy.Request]:
        """Yield one navigation request per search query.

        Nothing is yielded when the template reports a probability that is not
        ``None`` and is less than or equal to zero.
        """
        probability = search_request_template.get_probability()
        if probability is not None and probability <= 0:
            return
        for query in self.args.search_queries:
            meta: Dict[str, Any] = {
                "crawling_logs": {"page_type": "productNavigation"},
            }
            if self.args.extract == EcommerceExtract.productList:
                meta["inject"] = [ProductList]
                if self._custom_attrs_dep:
                    meta["inject"].append(self._custom_attrs_dep)
            with self._log_request_exception:
                yield search_request_template.request(query=query).to_scrapy(
                    callback=self.parse_navigation,
                    meta=meta,
                )

    def parse_navigation(
        self,
        response: DummyResponse,
        navigation: ProductNavigation,
        dynamic: DynamicDeps,
    ) -> Iterable[
        Union[
            scrapy.Request,
            ProductList,
            Dict[str, Union[ProductList, Optional[CustomAttributes]]],
        ]
    ]:
        """Handle a navigation page.

        Depending on the spider arguments this yields product requests, a
        next-page request, subcategory requests and/or a product list item.
        """
        page_params = self._modify_page_params_for_heuristics(
            response.meta.get("page_params")
        )

        products = navigation.items or []
        if self.args.extract == EcommerceExtract.product:
            for request in products:
                with self._log_request_exception:
                    yield self.get_parse_product_request(request)

        if (
            self.args.crawl_strategy != EcommerceCrawlStrategy.direct_item
            and navigation.nextPage
        ):
            if not products:
                # A page without product links is not paginated any further.
                self.logger.info(
                    f"Ignoring nextPage link {navigation.nextPage} since there "
                    f"are no product links found in {navigation.url}"
                )
            else:
                with self._log_request_exception:
                    yield self.get_nextpage_request(
                        cast(ProbabilityRequest, navigation.nextPage)
                    )

        # Subcategories are not followed for the direct_item and
        # pagination_only strategies, nor when search queries are in use.
        if (
            self.args.crawl_strategy
            not in {
                EcommerceCrawlStrategy.direct_item,
                EcommerceCrawlStrategy.pagination_only,
            }
            and not self.args.search_queries
        ):
            for request in navigation.subCategories or []:
                with self._log_request_exception:
                    yield self.get_subcategory_request(request, page_params=page_params)

        if self.args.extract == EcommerceExtract.productList:
            product_list: ProductList = dynamic[ProductList]
            if (
                item := self._produce_item(
                    product_list,
                    "productList",
                    response.url,
                    dynamic.get(CustomAttributes),
                )
            ) is not None:
                yield item

    def _produce_item(
        self,
        api_item: ItemTV,
        name: str,
        url: str,
        custom_attrs: Optional[CustomAttributes],
    ) -> Union[ItemTV, Dict[str, Union[ItemTV, Optional[CustomAttributes]]], None]:
        """Return the output for ``api_item``, or ``None`` if it is dropped.

        An item is kept when its probability is ``None`` or at least 0.1. A
        kept item is returned as-is, or wrapped in a dict under ``name``
        together with ``"customAttributes"`` when the ``custom_attrs_input``
        argument is set. A dropped item increments the
        ``drop_item/<name>/low_probability`` stat and is logged.
        """
        probability = api_item.get_probability()
        # TODO: convert to a configurable parameter later on after the launch
        if probability is None or probability >= 0.1:
            if self.args.custom_attrs_input:
                return {
                    name: api_item,
                    "customAttributes": custom_attrs,
                }
            return api_item

        assert self.crawler.stats
        self.crawler.stats.inc_value(f"drop_item/{name}/low_probability")
        self.logger.info(
            f"Ignoring item from {url} since its probability is "
            f"less than threshold of 0.1:\n{api_item}"
        )
        return None

    def parse_product(
        self, response: DummyResponse, product: Product, dynamic: DynamicDeps
    ) -> Iterable[
        Union[Product, Dict[str, Union[Product, Optional[CustomAttributes]]]]
    ]:
        """Yield the product of a product page unless :meth:`_produce_item`
        drops it."""
        if (
            item := self._produce_item(
                product, "product", response.url, dynamic.get(CustomAttributes)
            )
        ) is not None:
            yield item

    @staticmethod
    def get_parse_navigation_request_priority(request: ProbabilityRequest) -> int:
        """Return ``int(100 * probability)`` for ``request``, or 0 when it has
        no metadata or no probability."""
        if (
            not hasattr(request, "metadata")
            or not request.metadata
            or request.metadata.probability is None
        ):
            return 0
        return int(100 * request.metadata.probability)

    def get_parse_navigation_request(
        self,
        request: ProbabilityRequest,
        callback: Optional[Callable] = None,
        page_params: Optional[Dict[str, Any]] = None,
        priority: Optional[int] = None,
        page_type: str = "productNavigation",
    ) -> scrapy.Request:
        """Convert ``request`` into a Scrapy request for a navigation page.

        :param callback: defaults to :meth:`parse_navigation`.
        :param page_params: stored in the request meta; ``None`` becomes ``{}``.
        :param priority: request priority; when ``None`` it is derived from the
            request probability. An explicit ``0`` is honoured.
        :param page_type: value recorded under ``crawling_logs``.
        """
        callback = callback or self.parse_navigation
        # Only fall back when no priority was given: ``priority or ...`` would
        # also discard a legitimate explicit priority of 0.
        if priority is None:
            priority = self.get_parse_navigation_request_priority(request)

        meta: Dict[str, Any] = {
            "page_params": page_params or {},
            "crawling_logs": {
                "name": request.name or "",
                "probability": request.get_probability(),
                "page_type": page_type,
            },
        }
        if self.args.extract == EcommerceExtract.productList:
            meta["inject"] = [ProductList]
            if self._custom_attrs_dep:
                meta["inject"].append(self._custom_attrs_dep)

        return request.to_scrapy(
            callback=callback,
            priority=priority,
            meta=meta,
        )

    def get_subcategory_request(
        self,
        request: ProbabilityRequest,
        callback: Optional[Callable] = None,
        page_params: Optional[Dict[str, Any]] = None,
        priority: Optional[int] = None,
    ) -> scrapy.Request:
        """Build the request for a subcategory link.

        When the request name contains ``[heuristics]``, the marker is removed
        from ``request.name`` (the request object is modified), the page type
        becomes ``"productNavigation-heuristics"`` and ``page_params`` is
        passed on. Otherwise the page type is ``"subCategories"`` and
        ``page_params`` is discarded.
        """
        page_type = "subCategories"
        request_name = request.name or ""
        if "[heuristics]" not in request_name:
            page_params = None
        else:
            page_type = "productNavigation-heuristics"
            request.name = request_name.replace("[heuristics]", "").strip()
        return self.get_parse_navigation_request(
            request,
            callback,
            page_params,
            priority,
            page_type,
        )

    def get_nextpage_request(
        self,
        request: ProbabilityRequest,
        callback: Optional[Callable] = None,
        page_params: Optional[Dict[str, Any]] = None,
    ) -> scrapy.Request:
        """Build the request for a next-page link, using
        ``_NEXT_PAGE_PRIORITY`` as its priority and ``"nextPage"`` as its page
        type."""
        return self.get_parse_navigation_request(
            request, callback, page_params, self._NEXT_PAGE_PRIORITY, "nextPage"
        )

    def get_parse_product_request_priority(self, request: ProbabilityRequest) -> int:
        """Return ``int(100 * probability) + _NEXT_PAGE_PRIORITY``, treating a
        missing probability as 0."""
        probability = request.get_probability() or 0
        return int(100 * probability) + self._NEXT_PAGE_PRIORITY

    def get_parse_product_request(
        self, request: ProbabilityRequest, callback: Optional[Callable] = None
    ) -> scrapy.Request:
        """Convert ``request`` into a Scrapy request for a product page.

        ``callback`` defaults to :meth:`parse_product`. The resulting request
        has ``allow_offsite`` set to ``True`` in its meta.
        """
        callback = callback or self.parse_product
        priority = self.get_parse_product_request_priority(request)

        probability = request.get_probability()
        meta: Dict[str, Any] = {
            "crawling_logs": {
                "name": request.name,
                "probability": probability,
                "page_type": "product",
            },
        }
        if self._custom_attrs_dep:
            meta["inject"] = [
                self._custom_attrs_dep,
            ]

        scrapy_request = request.to_scrapy(
            callback=callback,
            priority=priority,
            meta=meta,
        )
        scrapy_request.meta["allow_offsite"] = True
        return scrapy_request

    def _modify_page_params_for_heuristics(
        self, page_params: Optional[Dict]
    ) -> Dict[str, Any]:
        """Return ``page_params`` (or a new empty dict when it is falsy) with
        ``"full_domain"`` removed unless the crawl strategy is ``full``.

        A non-empty ``page_params`` dict is modified in place and returned.
        """
        page_params = page_params or {}
        # Only allow heuristic extraction of links in non-homepage when on "full" crawl.
        if self.args.crawl_strategy != EcommerceCrawlStrategy.full:
            page_params.pop("full_domain", None)

        return page_params