from __future__ import annotations
from enum import Enum
from typing import (
TYPE_CHECKING,
Any,
Callable,
Dict,
Iterable,
List,
Optional,
Union,
cast,
)
import scrapy
from pydantic import BaseModel, ConfigDict, Field
from scrapy.crawler import Crawler
from scrapy_poet import DummyResponse, DynamicDeps
from scrapy_spider_metadata import Args
from web_poet import BrowserResponse
from zyte_common_items import (
CustomAttributes,
JobPosting,
JobPostingNavigation,
ProbabilityRequest,
SearchRequestTemplate,
)
from zyte_spider_templates.spiders.base import (
ARG_SETTING_PRIORITY,
INPUT_GROUP,
BaseSpider,
)
from ..documentation import document_enum
from ..params import (
CustomAttrsInputParam,
CustomAttrsMethodParam,
ExtractFrom,
ExtractFromParam,
GeolocationParam,
MaxRequestsParam,
SearchQueriesParam,
UrlParam,
UrlsFileParam,
UrlsParam,
parse_input_params,
)
if TYPE_CHECKING:
# typing.Self requires Python 3.11
from typing_extensions import Self
@document_enum
class JobPostingCrawlStrategy(str, Enum):
navigation: str = "navigation"
"""Follow pagination and job posting detail pages."""
direct_item: str = "direct_item"
"""Treat input URLs as direct links to job posting detail pages, and extract a
job posting from each."""
class JobPostingCrawlStrategyParam(BaseModel):
crawl_strategy: JobPostingCrawlStrategy = Field(
title="Crawl strategy",
description="Determines how input URLs and follow-up URLs are crawled.",
default=JobPostingCrawlStrategy.navigation,
json_schema_extra={
"enumMeta": {
JobPostingCrawlStrategy.navigation: {
"title": "Navigation",
"description": "Follow pagination and job posting detail pages.",
},
JobPostingCrawlStrategy.direct_item: {
"title": "Direct URLs to job postings",
"description": (
"Treat input URLs as direct links to job posting detail pages, and "
"extract a job posting from each."
),
},
},
},
)
class JobPostingSearchQueriesParam(SearchQueriesParam):
search_queries: List[str] = Field(
title="Search Queries",
description=(
"A list of search queries, one per line, to submit using the "
"search form found on each input URL. Only works for input URLs "
"that support search. May not work on every website."
),
default_factory=list,
json_schema_extra={
"default": [],
"widget": "textarea",
},
)
class JobPostingSpiderParams(
CustomAttrsMethodParam,
CustomAttrsInputParam,
ExtractFromParam,
MaxRequestsParam,
GeolocationParam,
JobPostingCrawlStrategyParam,
JobPostingSearchQueriesParam,
UrlsFileParam,
UrlsParam,
UrlParam,
BaseModel,
):
model_config = ConfigDict(
json_schema_extra={
"groups": [
INPUT_GROUP,
],
},
)
class JobPostingSpider(Args[JobPostingSpiderParams], BaseSpider):
"""Yield job postings from a job website.
See :class:`~zyte_spider_templates.spiders.job_posting.JobPostingSpiderParams`
for supported parameters.
.. seealso:: :ref:`job-posting`.
"""
name = "job_posting"
metadata: Dict[str, Any] = {
**BaseSpider.metadata,
"title": "Job posting",
"description": "[Experimental] Template for spiders that extract job posting data from websites.",
}
@classmethod
def from_crawler(cls, crawler: Crawler, *args, **kwargs) -> Self:
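        """Create the spider, parse its input parameters, and apply the
        ``extract_from`` argument to the relevant settings."""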
spider = super().from_crawler(crawler, *args, **kwargs)
parse_input_params(spider)
spider._init_extract_from()
return spider
def _init_extract_from(self):
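        """If ``extract_from`` is set, forward it through
        ``ZYTE_API_PROVIDER_PARAMS`` for both job posting and navigation
        extraction, merging with any existing provider params, which take
        precedence on key conflicts."""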
if self.args.extract_from is not None:
self.settings.set(
"ZYTE_API_PROVIDER_PARAMS",
{
"jobPostingOptions": {"extractFrom": self.args.extract_from},
"jobPostingNavigationOptions": {
"extractFrom": self.args.extract_from
},
**self.settings.get("ZYTE_API_PROVIDER_PARAMS", {}),
},
priority=ARG_SETTING_PRIORITY,
)
def get_start_request(self, url):
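        """Return the initial request for ``url``: a job posting extraction
        request under the ``direct_item`` strategy, a navigation request
        otherwise."""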
callback = (
self.parse_job_posting
if self.args.crawl_strategy == JobPostingCrawlStrategy.direct_item
else self.parse_navigation
)
meta: Dict[str, Any] = {
"crawling_logs": {
"page_type": "jobPosting"
if self.args.crawl_strategy == JobPostingCrawlStrategy.direct_item
else "jobPostingNavigation"
},
}
if (
self.args.crawl_strategy == JobPostingCrawlStrategy.direct_item
and self._custom_attrs_dep
):
meta["inject"] = [
self._custom_attrs_dep,
]
return scrapy.Request(
url=url,
callback=callback,
meta=meta,
)
def start_requests(self) -> Iterable[scrapy.Request]:
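        """Yield a request per start URL: search request template requests
        when search queries are configured, regular start requests
        otherwise."""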
if self.args.search_queries:
for url in self.start_urls:
meta: Dict[str, Any] = {
"crawling_logs": {"page_type": "searchRequestTemplate"},
}
if self.args.extract_from == ExtractFrom.browserHtml:
meta["inject"] = [BrowserResponse]
with self._log_request_exception:
yield scrapy.Request(
url=url,
callback=self.parse_search_request_template,
meta=meta,
)
else:
for url in self.start_urls:
with self._log_request_exception:
yield self.get_start_request(url)
def parse_search_request_template(
self,
response: DummyResponse,
search_request_template: SearchRequestTemplate,
dynamic: DynamicDeps,
) -> Iterable[scrapy.Request]:
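        """For each configured search query, yield a navigation request built
        from the extracted search request template, skipping templates whose
        probability is 0 or lower."""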
probability = search_request_template.get_probability()
if probability is not None and probability <= 0:
return
for query in self.args.search_queries:
meta: Dict[str, Any] = {
"crawling_logs": {"page_type": "jobPostingNavigation"},
}
with self._log_request_exception:
yield search_request_template.request(query=query).to_scrapy(
callback=self.parse_navigation,
meta=meta,
)
def parse_navigation(
self, response: DummyResponse, navigation: JobPostingNavigation
) -> Iterable[scrapy.Request]:
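        """Yield a job posting request per item on the navigation page and,
        when job posting links were found, a request for the next page."""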
job_postings = navigation.items or []
for request in job_postings:
with self._log_request_exception:
yield self.get_parse_job_posting_request(request)
if navigation.nextPage:
if not job_postings:
self.logger.info(
f"Ignoring nextPage link {navigation.nextPage} since there "
f"are no job posting links found in {navigation.url}"
)
else:
with self._log_request_exception:
yield self.get_nextpage_request(
cast(ProbabilityRequest, navigation.nextPage)
)
def parse_job_posting(
self, response: DummyResponse, job_posting: JobPosting, dynamic: DynamicDeps
) -> Iterable[
Union[JobPosting, Dict[str, Union[JobPosting, Optional[CustomAttributes]]]]
]:
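        """Yield the extracted job posting, paired with custom attributes when
        requested, dropping items whose probability is below 0.1."""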
probability = job_posting.get_probability()
# TODO: convert to a configurable parameter later on after the launch
if probability is None or probability >= 0.1:
if self.args.custom_attrs_input:
yield {
"jobPosting": job_posting,
"customAttributes": dynamic.get(CustomAttributes),
}
else:
yield job_posting
else:
assert self.crawler.stats
self.crawler.stats.inc_value("drop_item/job_posting/low_probability")
self.logger.info(
f"Ignoring item from {response.url} since its probability is "
f"less than threshold of 0.1:\n{job_posting}"
)
def get_parse_navigation_request(
self,
request: ProbabilityRequest,
callback: Optional[Callable] = None,
page_params: Optional[Dict[str, Any]] = None,
page_type: str = "jobPostingNavigation",
) -> scrapy.Request:
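        """Build a Scrapy request for a navigation page from ``request``,
        attaching page params and crawling log metadata."""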
callback = callback or self.parse_navigation
return request.to_scrapy(
callback=callback,
meta={
"page_params": page_params or {},
"crawling_logs": {
"name": request.name or "",
"probability": request.get_probability(),
"page_type": page_type,
},
},
)
def get_nextpage_request(
self,
request: ProbabilityRequest,
callback: Optional[Callable] = None,
page_params: Optional[Dict[str, Any]] = None,
):
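        """Build the request for the next navigation page."""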
return self.get_parse_navigation_request(
request, callback, page_params, "nextPage"
)
def get_parse_job_posting_request(
self, request: ProbabilityRequest, callback: Optional[Callable] = None
) -> scrapy.Request:
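        """Build a Scrapy request for a job posting page from ``request``,
        injecting the custom attributes dependency when configured and
        allowing offsite URLs."""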
callback = callback or self.parse_job_posting
probability = request.get_probability()
meta: Dict[str, Any] = {
"crawling_logs": {
"name": request.name,
"probability": probability,
"page_type": "jobPosting",
},
}
if self._custom_attrs_dep:
meta["inject"] = [
self._custom_attrs_dep,
]
scrapy_request = request.to_scrapy(
callback=callback,
meta=meta,
)
scrapy_request.meta["allow_offsite"] = True
return scrapy_request
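
# Illustrative usage sketch (assumption, not part of this module): in a Scrapy
# project with zyte-spider-templates and Zyte API configured, the template can
# be run with spider arguments; the URL below is a placeholder.
#
#   scrapy crawl job_posting -a url="https://jobs.example.com" \
#       -a crawl_strategy="navigation"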