Source code for zyte_spider_templates.spiders.serp

from enum import Enum
from typing import Any, Dict, Iterable, List, Optional, Union

from pydantic import BaseModel, Field, field_validator
from scrapy import Request
from scrapy.settings import SETTINGS_PRIORITIES, BaseSettings
from scrapy_poet import DummyResponse, DynamicDeps
from scrapy_spider_metadata import Args
from w3lib.url import add_or_replace_parameter
from zyte_common_items import (
    Article,
    ArticleList,
    ForumThread,
    JobPosting,
    Product,
    ProductList,
    Serp,
)

from .._geolocations import GEOLOCATION_OPTIONS_WITH_CODE, Geolocation
from ..documentation import document_enum
from ._google_domains import GoogleDomain
from ._google_gl import GOOGLE_GL_OPTIONS_WITH_CODE, GoogleGl
from ._google_hl import GOOGLE_HL_OPTIONS_WITH_CODE, GoogleHl
from .base import BaseSpider


class GoogleCrParam(BaseModel):
    cr: Optional[str] = Field(
        title="Content Countries (cr)",
        description=(
            "Restricts search results to documents originating in "
            "particular countries. See "
            "https://developers.google.com/custom-search/v1/reference/rest/v1/cse/list#body.QUERY_PARAMETERS.cr"
        ),
        default=None,
    )
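
# Illustrative values only (per the linked Google documentation): cr takes
# values such as "countryUS", and values can be combined with boolean
# operators, e.g. cr="countryUS|countryCA".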


class GoogleGlParam(BaseModel):
    gl: Optional[GoogleGl] = Field(
        title="User Country (gl)",
        description=(
            "Boosts results relevant to this country. See "
            "https://developers.google.com/custom-search/v1/reference/rest/v1/cse/list#body.QUERY_PARAMETERS.gl"
        ),
        default=None,
        json_schema_extra={
            "enumMeta": {
                code: {
                    "title": GOOGLE_GL_OPTIONS_WITH_CODE[code],
                }
                for code in GoogleGl
            }
        },
    )


class GoogleHlParam(BaseModel):
    hl: Optional[GoogleHl] = Field(
        title="User Language (hl)",
        description=(
            "User interface language, which can affect search results. See "
            "https://developers.google.com/custom-search/v1/reference/rest/v1/cse/list#body.QUERY_PARAMETERS.hl"
        ),
        default=None,
        json_schema_extra={
            "enumMeta": {
                code: {
                    "title": GOOGLE_HL_OPTIONS_WITH_CODE[code],
                }
                for code in GoogleHl
            }
        },
    )


class GoogleLrParam(BaseModel):
    lr: Optional[str] = Field(
        title="Content Languages (lr)",
        description=(
            "Restricts search results to documents written in the specified "
            "languages. See "
            "https://developers.google.com/custom-search/v1/reference/rest/v1/cse/list#body.QUERY_PARAMETERS.lr"
        ),
        default=None,
    )
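
# Illustrative value only (per the linked Google documentation): lr takes
# values such as "lang_ja" to restrict results to documents written in
# Japanese; see the linked page for the full value list.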


class SearchQueriesParam(BaseModel):
    search_queries: Optional[List[str]] = Field(
        title="Search Queries",
        description="Input 1 search query per line (e.g. foo bar).",
        json_schema_extra={
            "widget": "textarea",
            "pattern": r"(.|\r?\n)*\S+(.|\r?\n)*",
        },
    )

    @field_validator("search_queries", mode="before")
    @classmethod
    def validate_search_queries(cls, value: Union[List[str], str]) -> List[str]:
        """Validate a list of search queries.
        If a string is received as input, it is split into multiple strings
        on new lines.
        """
        if isinstance(value, str):
            value = value.split("\n")
        result = []
        for v in value:
            if v := v.strip():
                result.append(v)
        if not result:
            raise ValueError("The search_queries parameter value is missing or empty.")
        return result
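
    # A minimal sketch of the validator's behavior (hypothetical usage, not
    # part of the module):
    #
    #     SearchQueriesParam(search_queries="foo\n\n  bar  ")
    #     # -> search_queries == ["foo", "bar"]
    #
    #     SearchQueriesParam(search_queries="   ")
    #     # -> raises pydantic.ValidationError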


class SerpGeolocationParam(BaseModel):
    # We use “geolocation” as parameter name (instead of e.g. “ip_geolocation”)
    # to reuse the implementation in BaseSpider.
    geolocation: Optional[Geolocation] = Field(
        # The title, worded like this for contrast with gl, is the reason why
        # ..params.GeolocationParam is not used.
        title="IP Country",
        description="Country of the IP addresses to use.",
        default=None,
        json_schema_extra={
            "enumMeta": {
                code: {
                    "title": GEOLOCATION_OPTIONS_WITH_CODE[code],
                }
                for code in Geolocation
            }
        },
    )


class SerpMaxPagesParam(BaseModel):
    max_pages: int = Field(
        title="Max Pages",
        description="Maximum number of result pages to visit per search query.",
        ge=1,
        default=1,
    )


# MaxRequestsParam without the widget.
class SerpMaxRequestsParam(BaseModel):
    max_requests: Optional[int] = Field(
        description=(
            "The maximum number of Zyte API requests allowed for the crawl.\n"
            "\n"
            "Requests with error responses that cannot be retried or exceed "
            "their retry limit also count here, but they incur no costs "
            "and do not increase the request count in Scrapy Cloud."
        ),
        default=100,
        ge=1,
    )


class SerpResultsPerPageParam(BaseModel):
    results_per_page: Optional[int] = Field(
        title="Results Per Page",
        description="Maximum number of results per page.",
        ge=1,
        default=None,
    )


@document_enum
class SerpItemType(str, Enum):
    off: str = "off"
    """
    Do not follow result links.
    """

    article: str = "article"
    """
    Article data.
    """

    articleList: str = "articleList"
    """
    Article list data.
    """

    forumThread: str = "forumThread"
    """
    Forum thread data.
    """

    jobPosting: str = "jobPosting"
    """
    Job posting data.
    """

    product: str = "product"
    """
    Product data.
    """

    productList: str = "productList"
    """
    Product list data.
    """


ITEM_TYPE_CLASSES = {
    SerpItemType.article: Article,
    SerpItemType.articleList: ArticleList,
    SerpItemType.forumThread: ForumThread,
    SerpItemType.jobPosting: JobPosting,
    SerpItemType.product: Product,
    SerpItemType.productList: ProductList,
}
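
# These classes are requested via scrapy-poet's "inject" meta key (see
# GoogleSearchSpider.parse_serp below), so that parse_result() receives the
# extracted item through its DynamicDeps argument.

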
class SerpItemTypeParam(BaseModel):
    item_type: SerpItemType = Field(
        title="Follow and Extract",
        description=(
            "If specified, follow organic search result links, and extract "
            "the selected data type from the target pages. Spider output "
            "items will be of the specified data type, not search engine "
            "results page items."
        ),
        default=SerpItemType.off,
    )


class GoogleDomainParam(BaseModel):
    domain: GoogleDomain = Field(
        title="Domain",
        description="Target Google domain.",
        default=GoogleDomain.google_com,
    )


class GoogleSearchSpiderParams(
    GoogleLrParam,
    GoogleHlParam,
    SerpGeolocationParam,
    GoogleCrParam,
    GoogleGlParam,
    SerpItemTypeParam,
    SerpResultsPerPageParam,
    SerpMaxPagesParam,
    SerpMaxRequestsParam,
    SearchQueriesParam,
    GoogleDomainParam,
    BaseModel,
):
    pass
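
# A minimal sketch (hypothetical values) of validating spider parameters
# directly through the combined model:
#
#     params = GoogleSearchSpiderParams(search_queries="foo\nbar", max_pages=2)
#     # params.search_queries == ["foo", "bar"]

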
class GoogleSearchSpider(Args[GoogleSearchSpiderParams], BaseSpider):
    """Yield results from Google searches.

    See :class:`~zyte_spider_templates.spiders.serp.GoogleSearchSpiderParams`
    for supported parameters.

    .. seealso:: :ref:`google-search`.
    """

    name = "google_search"
    _default_results_per_page = 10

    metadata: Dict[str, Any] = {
        **BaseSpider.metadata,
        "title": "Google Search Results",
        "description": "Template for spiders that extract Google search results.",
    }

    @classmethod
    def update_settings(cls, settings: BaseSettings) -> None:
        super().update_settings(settings)
        # Use the aggressive retry policy unless ZYTE_API_RETRY_POLICY has
        # already been set at spider priority or higher.
        retry_policy_setting_priority = settings.getpriority("ZYTE_API_RETRY_POLICY")
        if (
            retry_policy_setting_priority is None
            or retry_policy_setting_priority < SETTINGS_PRIORITIES["spider"]
        ):
            settings.set(
                "ZYTE_API_RETRY_POLICY",
                "zyte_api.aggressive_retrying",
                priority="spider",
            )

    def get_serp_request(self, url: str, *, page_number: int):
        # Map spider arguments to their Google URL query parameters.
        for argument, parameter in (
            (self.args.cr, "cr"),
            (self.args.gl, "gl"),
            (self.args.hl, "hl"),
            (self.args.lr, "lr"),
            (self.args.results_per_page, "num"),
        ):
            if not argument:
                continue
            if isinstance(argument, Enum):
                argument = argument.value
            if not isinstance(argument, str):
                argument = str(argument)
            url = add_or_replace_parameter(url, parameter, argument)
        return Request(
            url=url,
            callback=self.parse_serp,
            cb_kwargs={
                "page_number": page_number,
            },
            meta={
                "crawling_logs": {"page_type": "serp"},
                "zyte_api": {
                    "serp": True,
                },
            },
        )

    def start_requests(self) -> Iterable[Request]:
        search_queries = self.args.search_queries
        if not search_queries:
            raise ValueError("No search queries specified.")

        url = f"https://www.{self.args.domain.value}/search"
        for search_query in search_queries:
            search_url = add_or_replace_parameter(url, "q", search_query)
            with self._log_request_exception:
                yield self.get_serp_request(search_url, page_number=1)

    def parse_serp(self, response, page_number) -> Iterable[Union[Request, Serp]]:
        serp = Serp.from_dict(response.raw_api_response["serp"])

        if page_number < self.args.max_pages:
            # Google's "start" parameter is the 0-based index of the first
            # result of the next page.
            next_start = page_number * (
                self.args.results_per_page or self._default_results_per_page
            )
            if serp.organicResults and (
                serp.metadata.totalOrganicResults is None
                or serp.metadata.totalOrganicResults > next_start
            ):
                next_url = add_or_replace_parameter(serp.url, "start", str(next_start))
                with self._log_request_exception:
                    yield self.get_serp_request(next_url, page_number=page_number + 1)

        if self.args.item_type == SerpItemType.off:
            yield serp
            return

        # Follow organic results and let scrapy-poet inject the selected item
        # type on the target pages.
        for result in serp.organicResults:
            with self._log_request_exception:
                yield response.follow(
                    result.url,
                    callback=self.parse_result,
                    meta={
                        "crawling_logs": {"page_type": self.args.item_type.value},
                        "inject": [ITEM_TYPE_CLASSES[self.args.item_type]],
                    },
                )

    def parse_result(
        self, response: DummyResponse, dynamic: DynamicDeps
    ) -> Iterable[Any]:
        # The only dynamic dependency is the item type requested through the
        # "inject" meta key; yield that extracted item as spider output.
        yield next(iter(dynamic.values()))
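
# A hypothetical invocation, assuming a Scrapy project already configured for
# scrapy-poet and scrapy-zyte-api (command-line values are illustrative):
#
#     scrapy crawl google_search \
#         -a search_queries="foo bar" \
#         -a max_pages=2 \
#         -a item_type=product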