Source code for zyte_spider_templates.spiders.serp

from enum import Enum
from typing import Any, Dict, Iterable, List, Optional, Union

from pydantic import BaseModel, Field, field_validator
from scrapy import Request
from scrapy.settings import SETTINGS_PRIORITIES, BaseSettings
from scrapy_poet import DummyResponse, DynamicDeps
from scrapy_spider_metadata import Args
from w3lib.url import add_or_replace_parameter
from zyte_common_items import (
    Article,
    ArticleList,
    ForumThread,
    JobPosting,
    Product,
    ProductList,
    Serp,
)

from .._geolocations import GEOLOCATION_OPTIONS_WITH_CODE, Geolocation
from ..documentation import document_enum
from ._google_domains import GoogleDomain
from ._google_gl import GOOGLE_GL_OPTIONS_WITH_CODE, GoogleGl
from ._google_hl import GOOGLE_HL_OPTIONS_WITH_CODE, GoogleHl
from .base import BaseSpider


class GoogleCrParam(BaseModel):
    cr: Optional[str] = Field(
        title="Content Countries (cr)",
        description=(
            "Restricts search results to documents originating in "
            "particular countries. See "
            "https://developers.google.com/custom-search/v1/reference/rest/v1/cse/list#body.QUERY_PARAMETERS.cr"
        ),
        default=None,
    )
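
# Illustrative values only (per the linked Google documentation): cr takes
# values such as "countryUS", and values can be combined with boolean
# operators, e.g. cr="countryUS|countryCA".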


class GoogleGlParam(BaseModel):
    gl: Optional[GoogleGl] = Field(
        title="User Country (gl)",
        description=(
            "Boosts results relevant to this country. See "
            "https://developers.google.com/custom-search/v1/reference/rest/v1/cse/list#body.QUERY_PARAMETERS.gl"
        ),
        default=None,
        json_schema_extra={
            "enumMeta": {
                code: {
                    "title": GOOGLE_GL_OPTIONS_WITH_CODE[code],
                }
                for code in GoogleGl
            }
        },
    )


class GoogleHlParam(BaseModel):
    hl: Optional[GoogleHl] = Field(
        title="User Language (hl)",
        description=(
            "User interface language, which can affect search results. See "
            "https://developers.google.com/custom-search/v1/reference/rest/v1/cse/list#body.QUERY_PARAMETERS.hl"
        ),
        default=None,
        json_schema_extra={
            "enumMeta": {
                code: {
                    "title": GOOGLE_HL_OPTIONS_WITH_CODE[code],
                }
                for code in GoogleHl
            }
        },
    )


class GoogleLrParam(BaseModel):
    lr: Optional[str] = Field(
        title="Content Languages (lr)",
        description=(
            "Restricts search results to documents written in the specified "
            "languages. See "
            "https://developers.google.com/custom-search/v1/reference/rest/v1/cse/list#body.QUERY_PARAMETERS.lr"
        ),
        default=None,
    )
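
# Illustrative value only (per the linked Google documentation): lr takes
# values such as "lang_ja" to restrict results to documents written in
# Japanese; see the linked page for the full value list.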


class SearchQueriesParam(BaseModel):
    search_queries: Optional[List[str]] = Field(
        title="Search Queries",
        description="Input 1 search query per line (e.g. foo bar).",
        json_schema_extra={
            "widget": "textarea",
            "pattern": r"(.|\r?\n)*\S+(.|\r?\n)*",
        },
    )

    @field_validator("search_queries", mode="before")
    @classmethod
    def validate_search_queries(cls, value: Union[List[str], str]) -> List[str]:
        """Validate a list of search queries.
        If a string is received as input, it is split into multiple strings
        on new lines.
        """
        if isinstance(value, str):
            value = value.split("\n")
        result = []
        for v in value:
            if v := v.strip():
                result.append(v)
        if not result:
            raise ValueError("The search_queries parameter value is missing or empty.")
        return result
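
    # A minimal sketch of the validator's behavior (hypothetical usage, not
    # part of the module):
    #
    #     SearchQueriesParam(search_queries="foo\n\n  bar  ")
    #     # -> search_queries == ["foo", "bar"]
    #
    #     SearchQueriesParam(search_queries="   ")
    #     # -> raises pydantic.ValidationError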


class SerpGeolocationParam(BaseModel):
    # We use “geolocation” as parameter name (instead of e.g. “ip_geolocation”)
    # to reuse the implementation in BaseSpider.
    geolocation: Optional[Geolocation] = Field(
        # The title, worded like this for contrast with gl, is the reason why
        # ..params.GeolocationParam is not used.
        title="IP Country",
        description="Country of the IP addresses to use.",
        default=None,
        json_schema_extra={
            "enumMeta": {
                code: {
                    "title": GEOLOCATION_OPTIONS_WITH_CODE[code],
                }
                for code in Geolocation
            }
        },
    )


class SerpMaxPagesParam(BaseModel):
    max_pages: int = Field(
        title="Max Pages",
        description="Maximum number of result pages to visit per search query.",
        ge=1,
        default=1,
    )


# MaxRequestsParam without the widget.
class SerpMaxRequestsParam(BaseModel):
    max_requests: Optional[int] = Field(
        description=(
            "The maximum number of Zyte API requests allowed for the crawl.\n"
            "\n"
            "Requests with error responses that cannot be retried or exceed "
            "their retry limit also count here, but they incur no costs "
            "and do not increase the request count in Scrapy Cloud."
        ),
        default=100,
        ge=1,
    )


class SerpResultsPerPageParam(BaseModel):
    results_per_page: Optional[int] = Field(
        title="Results Per Page",
        description="Maximum number of results per page.",
        ge=1,
        default=None,
    )


@document_enum
class SerpItemType(str, Enum):
    off: str = "off"
    """
    Do not follow result links.
    """

    article: str = "article"
    """
    Article data.
    """

    articleList: str = "articleList"
    """
    Article list data.
    """

    forumThread: str = "forumThread"
    """
    Forum thread data.
    """

    jobPosting: str = "jobPosting"
    """
    Job posting data.
    """

    product: str = "product"
    """
    Product data.
    """

    productList: str = "productList"
    """
    Product list data.
    """


ITEM_TYPE_CLASSES = {
    SerpItemType.article: Article,
    SerpItemType.articleList: ArticleList,
    SerpItemType.forumThread: ForumThread,
    SerpItemType.jobPosting: JobPosting,
    SerpItemType.product: Product,
    SerpItemType.productList: ProductList,
}
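
# These classes are requested via scrapy-poet's "inject" meta key (see
# GoogleSearchSpider.parse_serp below), so that parse_result() receives the
# extracted item through its DynamicDeps argument.

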
class SerpItemTypeParam(BaseModel):
    item_type: SerpItemType = Field(
        title="Follow and Extract",
        description=(
            "If specified, follow organic search result links, and extract "
            "the selected data type from the target pages. Spider output "
            "items will be of the specified data type, not search engine "
            "results page items."
        ),
        default=SerpItemType.off,
    )


class GoogleDomainParam(BaseModel):
    domain: GoogleDomain = Field(
        title="Domain",
        description="Target Google domain.",
        default=GoogleDomain.google_com,
    )


class GoogleSearchSpiderParams(
    GoogleLrParam,
    GoogleHlParam,
    SerpGeolocationParam,
    GoogleCrParam,
    GoogleGlParam,
    SerpItemTypeParam,
    SerpResultsPerPageParam,
    SerpMaxPagesParam,
    SerpMaxRequestsParam,
    SearchQueriesParam,
    GoogleDomainParam,
    BaseModel,
):
    pass
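
# A minimal sketch (hypothetical values) of validating spider parameters
# directly through the combined model:
#
#     params = GoogleSearchSpiderParams(search_queries="foo\nbar", max_pages=2)
#     # params.search_queries == ["foo", "bar"]

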
class GoogleSearchSpider(Args[GoogleSearchSpiderParams], BaseSpider):
    """Yield results from Google searches.

    See :class:`~zyte_spider_templates.spiders.serp.GoogleSearchSpiderParams`
    for supported parameters.

    .. seealso:: :ref:`google-search`.
    """

    name = "google_search"
    _default_results_per_page = 10

    metadata: Dict[str, Any] = {
        **BaseSpider.metadata,
        "title": "Google Search Results",
        "description": "Template for spiders that extract Google search results.",
    }

    @classmethod
    def update_settings(cls, settings: BaseSettings) -> None:
        super().update_settings(settings)
        # Use the aggressive retry policy unless ZYTE_API_RETRY_POLICY has
        # already been set at spider priority or higher.
        retry_policy_setting_priority = settings.getpriority("ZYTE_API_RETRY_POLICY")
        if (
            retry_policy_setting_priority is None
            or retry_policy_setting_priority < SETTINGS_PRIORITIES["spider"]
        ):
            settings.set(
                "ZYTE_API_RETRY_POLICY",
                "zyte_api.aggressive_retrying",
                priority="spider",
            )

    def get_serp_request(self, url: str, *, page_number: int):
        # Map spider arguments to their Google URL query parameters.
        for argument, parameter in (
            (self.args.cr, "cr"),
            (self.args.gl, "gl"),
            (self.args.hl, "hl"),
            (self.args.lr, "lr"),
            (self.args.results_per_page, "num"),
        ):
            if not argument:
                continue
            if isinstance(argument, Enum):
                argument = argument.value
            if not isinstance(argument, str):
                argument = str(argument)
            url = add_or_replace_parameter(url, parameter, argument)
        return Request(
            url=url,
            callback=self.parse_serp,
            cb_kwargs={
                "page_number": page_number,
            },
            meta={
                "crawling_logs": {"page_type": "serp"},
                "zyte_api": {
                    "serp": True,
                },
            },
        )

    def start_requests(self) -> Iterable[Request]:
        search_queries = self.args.search_queries
        if not search_queries:
            raise ValueError("No search queries specified.")

        url = f"https://www.{self.args.domain.value}/search"
        for search_query in search_queries:
            search_url = add_or_replace_parameter(url, "q", search_query)
            with self._log_request_exception:
                yield self.get_serp_request(search_url, page_number=1)

    def parse_serp(self, response, page_number) -> Iterable[Union[Request, Serp]]:
        serp = Serp.from_dict(response.raw_api_response["serp"])

        if page_number < self.args.max_pages:
            # Google's "start" parameter is the 0-based index of the first
            # result of the next page.
            next_start = page_number * (
                self.args.results_per_page or self._default_results_per_page
            )
            if serp.organicResults and (
                serp.metadata.totalOrganicResults is None
                or serp.metadata.totalOrganicResults > next_start
            ):
                next_url = add_or_replace_parameter(serp.url, "start", str(next_start))
                with self._log_request_exception:
                    yield self.get_serp_request(next_url, page_number=page_number + 1)

        if self.args.item_type == SerpItemType.off:
            yield serp
            return

        # Follow organic results and let scrapy-poet inject the selected item
        # type on the target pages.
        for result in serp.organicResults:
            with self._log_request_exception:
                yield response.follow(
                    result.url,
                    callback=self.parse_result,
                    meta={
                        "crawling_logs": {"page_type": self.args.item_type.value},
                        "inject": [ITEM_TYPE_CLASSES[self.args.item_type]],
                    },
                )

    def parse_result(
        self, response: DummyResponse, dynamic: DynamicDeps
    ) -> Iterable[Any]:
        # The only dynamic dependency is the item type requested through the
        # "inject" meta key; yield that extracted item as spider output.
        yield next(iter(dynamic.values()))
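
# A hypothetical invocation, assuming a Scrapy project already configured for
# scrapy-poet and scrapy-zyte-api (command-line values are illustrative):
#
#     scrapy crawl google_search \
#         -a search_queries="foo bar" \
#         -a max_pages=2 \
#         -a item_type=product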