Source code for zyte_spider_templates.pages.article_heuristics

import json
import logging
from typing import Iterable, List

import attrs
import xtractmime
from scrapy.http import TextResponse
from scrapy.link import Link
from scrapy.linkextractors import LinkExtractor
from web_poet import AnyResponse, HttpResponse, PageParams, Stats, field, handle_urls
from web_poet.utils import cached_method
from zyte_common_items import (
    BaseArticleNavigationPage,
    ProbabilityMetadata,
    ProbabilityRequest,
)

from zyte_spider_templates.feeds import get_feed_urls, parse_feed
from zyte_spider_templates.heuristics import (
    classify_article_crawling_links,
    classify_article_feed_links,
)

from ..heuristics import is_feed_content

logger = logging.getLogger(__name__)


def is_feed_request(request: ProbabilityRequest) -> bool:
    """Tell whether *request* was named by the feed heuristic.

    The check is purely name-based: the request name must start with the
    ``[heuristics][articleNavigation][feed]`` tag. A missing or empty name
    yields ``False``, and so does the ``[feed items]`` tag, because its
    bracket closes after ``items`` rather than right after ``feed``.
    """
    name = request.name
    if not name:
        return False
    return name.startswith("[heuristics][articleNavigation][feed]")



[docs]
@handle_urls("")
@attrs.define
class HeuristicsArticleNavigationPage(BaseArticleNavigationPage):
    """Article navigation page object that finds links through heuristics.

    For a feed response, ``items`` yields the links found in the feed. For any
    other response, ``subCategories`` yields the discovered feed links
    followed by the remaining allowed links, and ``items`` yields the allowed
    links as article candidates. Two page parameters alter this behavior:
    ``skip_subcategories`` and ``only_feeds`` (both default to ``False``).
    """

    response: AnyResponse
    stats: Stats
    page_params: PageParams
    # Per-heuristic settings: "name" is embedded in the generated request name
    # and "dummy probability" becomes the request's metadata probability
    # (see ``_get_request``).
    _ARTICLE_HEURISTIC = {"name": "article", "dummy probability": 0.5}
    _NAVIGATION_HEURISTIC = {"name": "subCategories", "dummy probability": 0.5}
    _FEED_HEURISTIC = {"name": "feed", "dummy probability": 1.0}
    _FEED_ITEMS_HEURISTIC = {"name": "feed items", "dummy probability": 0.99}

    @field
    def url(self) -> str:
        """Return the URL of the response as a plain string."""
        return str(self.response.url)

    @field
    def subCategories(self) -> Iterable[ProbabilityRequest]:
        """Yield requests for feeds and for further navigation links.

        Nothing is yielded when the response itself is a feed. Otherwise the
        allowed feed links come first; then, unless ``skip_subcategories`` or
        ``only_feeds`` is set, every allowed page link whose URL is not one of
        those feed URLs.
        """
        if self._is_response_feed():
            return

        feeds = self._get_feed_links()
        feed_urls = {link.url for link in feeds}
        for link in feeds:
            yield self._get_request(link, self._FEED_HEURISTIC)

        if self.skip_subcategories() or self.is_only_feeds():
            return

        sub_categories = [
            link
            for link in self._get_article_or_navigation_links()
            if link.url not in feed_urls
        ]
        for link in sub_categories:
            yield self._get_request(link, self._NAVIGATION_HEURISTIC)

    @field
    def items(self) -> Iterable[ProbabilityRequest]:
        """Yield requests for article candidates.

        A feed response contributes the allowed links found in the feed. Any
        other response contributes its allowed page links, unless
        ``only_feeds`` is set, in which case nothing is yielded.
        """
        if self._is_response_feed():
            links = self._get_feed_items_links()
            heuristic = self._FEED_ITEMS_HEURISTIC
        elif not self.is_only_feeds():
            links = self._get_article_or_navigation_links()
            heuristic = self._ARTICLE_HEURISTIC
        else:
            return

        for link in links:
            yield self._get_request(link, heuristic)

    @cached_method
    def _get_article_or_navigation_links(self) -> List[Link]:
        """Extract links from an HTML web page.

        Returns the links accepted by ``classify_article_crawling_links`` and
        reports the outcome under ``heuristic_navigation_or_article``.
        """
        # The body is rebuilt from already-decoded text, so it is always
        # UTF-8. Declare that explicitly: otherwise TextResponse re-detects
        # the encoding from the markup (e.g. a meta charset declaration naming
        # the page's original encoding) and decodes the UTF-8 bytes with the
        # wrong codec, garbling non-ASCII link text and URLs.
        response = TextResponse(
            url=str(self.response.url),
            body=self.response.text.encode("utf-8"),
            encoding="utf-8",
        )
        link_extractor = LinkExtractor()
        links = link_extractor.extract_links(response)
        allowed_links, disallowed_links = classify_article_crawling_links(links)

        _log_and_stats(
            self,
            "heuristic_navigation_or_article",
            links,
            allowed_links,
            disallowed_links,
        )
        return allowed_links

    @cached_method
    def _get_feed_items_links(self) -> List[Link]:
        """Extract links from an RSS/Atom feed.

        Returns the links accepted by ``classify_article_crawling_links`` and
        reports the outcome under ``heuristic_feed_items``.
        """
        links = [Link(url) for url in parse_feed(self.response)]
        allowed_links, disallowed_links = classify_article_crawling_links(links)

        _log_and_stats(
            self, "heuristic_feed_items", links, allowed_links, disallowed_links
        )
        return allowed_links

    @cached_method
    def _get_feed_links(self) -> List[Link]:
        """Extract links to RSS/Atom feeds from an HTML web page.

        Returns the links accepted by ``classify_article_feed_links`` and
        reports the outcome under ``heuristic_feed``.
        """
        links = [Link(url) for url in get_feed_urls(self.response)]
        allowed_links, disallowed_links = classify_article_feed_links(links)

        _log_and_stats(self, "heuristic_feed", links, allowed_links, disallowed_links)
        return allowed_links

    @cached_method
    def _is_response_feed(self) -> bool:
        """Return True if a response is an RSS or Atom feed.

        For an ``HttpResponse`` the ``Content-Type`` header is passed to the
        MIME detection as a hint. For any other response type,
        ``is_feed_content`` is consulted first; a positive answer short-cuts
        to ``True`` after logging a warning. In the remaining cases the MIME
        type is detected from the response text, and the response counts as a
        feed when that type is XML or JSON.
        """

        content_type = ""
        if isinstance(self.response.response, HttpResponse):
            content_type = self.response.response.headers.get("Content-Type", "")
        elif is_feed_content(self.response.response):
            logger.warning(
                "It is likely that the spider is using BrowserHtml to extract the RSS feed. "
                "Please note that using HttpResponse is more efficient."
            )
            return True

        mime_type = xtractmime.extract_mime(
            self.response.text.encode(),
            content_types=(content_type.encode(),),
        )

        return xtractmime.mimegroups.is_xml_mime_type(
            mime_type
        ) or xtractmime.mimegroups.is_json_mime_type(mime_type)

    def _get_request(self, link, heuristic) -> ProbabilityRequest:
        """Build the request for *link* as classified by *heuristic*.

        The request name is the heuristic tag followed by the stripped link
        text; the heuristic's placeholder probability is stored in the
        request metadata.
        """
        return ProbabilityRequest(
            url=link.url,
            name=f"[heuristics][articleNavigation][{heuristic['name']}] {link.text.strip()}",
            metadata=ProbabilityMetadata(probability=heuristic["dummy probability"]),
        )

    def skip_subcategories(self) -> bool:
        """Return the ``skip_subcategories`` page parameter (default ``False``)."""
        return self.page_params.get("skip_subcategories", False)

    def is_only_feeds(self) -> bool:
        """Return the ``only_feeds`` page parameter (default ``False``)."""
        return self.page_params.get("only_feeds", False)



def _log_and_stats(self, urls_type, links, allowed_links, disallowed_links):
    """Report one link-classification outcome to the log and to the stats.

    ``self`` is the page object the links were extracted for; the remaining
    arguments are forwarded unchanged to ``_logs`` and then to ``_stats``.
    """
    # Order matters only for reproducibility: log first, then count.
    for report in (_logs, _stats):
        report(self, urls_type, links, allowed_links, disallowed_links)


def _stats(page, urls_type, urls, allowed_urls, disallowed_urls):
    """Increment the ``article_spider/<urls_type>/...`` counters on ``page.stats``.

    Each call counts one visit, marks it as either ``with_links`` or
    ``no_links`` depending on whether *urls* is empty, and adds the number of
    total, allowed and disallowed links to the matching counters.
    """
    prefix = f"article_spider/{urls_type}"
    found_any = bool(urls)
    counters = (
        ("visited", 1),
        # Both keys are always touched (one of them with 0).
        ("no_links", 0 if found_any else 1),
        ("with_links", 1 if found_any else 0),
        ("links/total", len(urls)),
        ("links/allow", len(allowed_urls)),
        ("links/disallow", len(disallowed_urls)),
    )
    for suffix, amount in counters:
        page.stats.inc(f"{prefix}/{suffix}", amount)


def _logs(page, urls_type, urls, allowed_urls, disallowed_urls):
    """Emit a DEBUG log entry summarizing one link-classification outcome.

    The entry is a JSON document with the page's item class name, the page
    URL, the kind of URLs processed, the counts of found, allowed and skipped
    URLs, and the list of skipped URLs (``Link`` objects are reduced to their
    ``url``; anything else is logged as given).
    """
    # This runs for every processed page. Skip building and serializing the
    # payload when DEBUG records would be discarded anyway.
    if not logger.isEnabledFor(logging.DEBUG):
        return

    page_name = page.item_cls.__name__
    data = {
        "page": page_name,
        "page url": page.url,
        "urls type": urls_type,
        "urls found": len(urls),
        "allowed urls": len(allowed_urls),
        "urls to skip": len(disallowed_urls),
        "list of urls to skip": [
            url.url if isinstance(url, Link) else url for url in disallowed_urls
        ],
    }
    logger.debug("Article Heuristic Logs:\n%s", json.dumps(data, indent=2))