Source code for zyte_spider_templates.spiders.base

from __future__ import annotations

from importlib.metadata import version
from typing import TYPE_CHECKING, Annotated, Any, Dict
from warnings import warn

import scrapy
from pydantic import BaseModel, ConfigDict, model_validator
from scrapy.crawler import Crawler
from scrapy_zyte_api import custom_attrs
from zyte_common_items import CustomAttributes

from ..params import (
    INPUT_GROUP,
    ExtractFromParam,
    GeolocationParam,
    MaxRequestsParam,
    SearchQueriesParam,
    UrlParam,
    UrlsFileParam,
    UrlsParam,
)

if TYPE_CHECKING:
    # typing.Self requires Python 3.11
    from typing_extensions import Self


class _LogExceptionsContextManager:
    def __init__(self, spider, exc_info):
        self._spider = spider
        self._exc_info = exc_info

    def __enter__(self):
        return

    def __exit__(self, exc_type, exc_value, exc_traceback):
        if exc_type is None:
            return True
        if issubclass(exc_type, self._exc_info):
            self._spider.logger.exception(exc_value)
            return True
        return False


# Higher priority than command-line-defined settings (40).
ARG_SETTING_PRIORITY: int = 50


class BaseSpiderParams(
    ExtractFromParam,
    MaxRequestsParam,
    GeolocationParam,
    SearchQueriesParam,
    UrlsFileParam,
    UrlsParam,
    UrlParam,
    BaseModel,
):
    model_config = ConfigDict(
        json_schema_extra={
            "groups": [
                INPUT_GROUP,
            ],
        },
    )

    @model_validator(mode="after")
    def deprecated(self):
        warn(
            (
                "BaseSpiderParams is deprecated, use pydantic.BaseModel and "
                "your desired combination of classes from "
                "zyte_spider_templates.params instead."
            ),
            DeprecationWarning,
        )
        return self


[docs] class BaseSpider(scrapy.Spider): custom_settings: Dict[str, Any] = { # type: ignore[assignment] "ZYTE_API_TRANSPARENT_MODE": True, "_ZYTE_API_USER_AGENT": f"zyte-spider-templates/{version('zyte-spider-templates')}", } metadata: Dict[str, Any] = { "template": True, "title": "Base", "description": "Base template.", } _NEXT_PAGE_PRIORITY: int = 100 _custom_attrs_dep = None _log_request_exception: _LogExceptionsContextManager = None # type: ignore[assignment] @classmethod def from_crawler(cls, crawler: Crawler, *args, **kwargs) -> Self: spider = super().from_crawler(crawler, *args, **kwargs) # all subclasses of this need to also have Args as a subclass # this may be possible to express in type hints instead assert hasattr(spider, "args") if geolocation := getattr(spider.args, "geolocation", None): # We set the geolocation in ZYTE_API_PROVIDER_PARAMS for injected # dependencies, and in ZYTE_API_AUTOMAP_PARAMS for page object # additional requests. for component in ("AUTOMAP", "PROVIDER"): default_params = spider.settings.getdict(f"ZYTE_API_{component}_PARAMS") default_params["geolocation"] = geolocation spider.settings.set( f"ZYTE_API_{component}_PARAMS", default_params, priority=ARG_SETTING_PRIORITY, ) if spider.args.max_requests: spider.settings.set( "ZYTE_API_MAX_REQUESTS", spider.args.max_requests, priority=ARG_SETTING_PRIORITY, ) if custom_attrs_input := getattr(spider.args, "custom_attrs_input", None): custom_attrs_options = { "method": spider.args.custom_attrs_method, } if max_input_tokens := crawler.settings.getint("ZYTE_API_MAX_INPUT_TOKENS"): custom_attrs_options["maxInputTokens"] = max_input_tokens if max_output_tokens := crawler.settings.getint( "ZYTE_API_MAX_OUTPUT_TOKENS" ): custom_attrs_options["maxOutputTokens"] = max_output_tokens spider._custom_attrs_dep = Annotated[ CustomAttributes, custom_attrs(custom_attrs_input, custom_attrs_options), ] spider._log_request_exception = _LogExceptionsContextManager(spider, ValueError) return spider