Source code for zyte_spider_templates.pages.article_heuristics

import json
import logging
from typing import Iterable, List

import attrs
import xtractmime
from scrapy.http import TextResponse
from scrapy.link import Link
from scrapy.linkextractors import LinkExtractor
from web_poet import AnyResponse, HttpResponse, PageParams, Stats, field, handle_urls
from web_poet.utils import cached_method
from zyte_common_items import (
    BaseArticleNavigationPage,
    ProbabilityMetadata,
    ProbabilityRequest,
)

from zyte_spider_templates.feeds import get_feed_urls, parse_feed
from zyte_spider_templates.heuristics import (
    classify_article_crawling_links,
    classify_article_feed_links,
)

from ..heuristics import is_feed_content

logger = logging.getLogger(__name__)


def is_feed_request(request: ProbabilityRequest) -> bool:
    return bool(
        request.name
        and request.name.startswith("[heuristics][articleNavigation][feed]")
    )


[docs] @handle_urls("") @attrs.define class HeuristicsArticleNavigationPage(BaseArticleNavigationPage): response: AnyResponse stats: Stats page_params: PageParams _ARTICLE_HEURISTIC = {"name": "article", "dummy probability": 0.5} _NAVIGATION_HEURISTIC = {"name": "subCategories", "dummy probability": 0.5} _FEED_HEURISTIC = {"name": "feed", "dummy probability": 1.0} _FEED_ITEMS_HEURISTIC = {"name": "feed items", "dummy probability": 0.99} @field def url(self) -> str: return str(self.response.url) @field def subCategories(self) -> Iterable[ProbabilityRequest]: if self._is_response_feed(): return feeds = self._get_feed_links() feed_urls = {link.url for link in feeds} for link in feeds: yield self._get_request(link, self._FEED_HEURISTIC) if self.skip_subcategories() or self.is_only_feeds(): return sub_categories = [ link for link in self._get_article_or_navigation_links() if link.url not in feed_urls ] for link in sub_categories: yield self._get_request(link, self._NAVIGATION_HEURISTIC) @field def items(self) -> Iterable[ProbabilityRequest]: if self._is_response_feed(): links = self._get_feed_items_links() heuristic = self._FEED_ITEMS_HEURISTIC elif not self.is_only_feeds(): links = self._get_article_or_navigation_links() heuristic = self._ARTICLE_HEURISTIC else: return for link in links: yield self._get_request(link, heuristic) @cached_method def _get_article_or_navigation_links(self) -> List[Link]: """Extract links from an HTML web page.""" response = TextResponse( url=str(self.response.url), body=self.response.text.encode() ) link_extractor = LinkExtractor() links = link_extractor.extract_links(response) allowed_links, disallowed_links = classify_article_crawling_links(links) _log_and_stats( self, "heuristic_navigation_or_article", links, allowed_links, disallowed_links, ) return allowed_links @cached_method def _get_feed_items_links(self) -> List[Link]: """Extract links from an RSS/Atom feed.""" links = [Link(url) for url in parse_feed(self.response)] allowed_links, disallowed_links = classify_article_crawling_links(links) _log_and_stats( self, "heuristic_feed_items", links, allowed_links, disallowed_links ) return allowed_links @cached_method def _get_feed_links(self) -> List[Link]: """Extract links to RSS/Atom feeds form an HTML web page.""" links = [Link(url) for url in get_feed_urls(self.response)] allowed_links, disallowed_links = classify_article_feed_links(links) _log_and_stats(self, "heuristic_feed", links, allowed_links, disallowed_links) return allowed_links @cached_method def _is_response_feed(self) -> bool: """Return True if a response is an RSS or Atom feed.""" content_type = "" if isinstance(self.response.response, HttpResponse): content_type = self.response.response.headers.get("Content-Type", "") elif is_feed_content(self.response.response): logger.warning( "It is likely that the spider is using BrowserHtml to extract the RSS feed. " "Please note that using HttpResponse is more efficient." ) return True mime_type = xtractmime.extract_mime( self.response.text.encode(), content_types=(content_type.encode(),), ) return xtractmime.mimegroups.is_xml_mime_type( mime_type ) or xtractmime.mimegroups.is_json_mime_type(mime_type) def _get_request(self, link, heuristic) -> ProbabilityRequest: return ProbabilityRequest( url=link.url, name=f"[heuristics][articleNavigation][{heuristic['name']}] {link.text.strip()}", metadata=ProbabilityMetadata(probability=heuristic["dummy probability"]), ) def skip_subcategories(self) -> bool: return self.page_params.get("skip_subcategories", False) def is_only_feeds(self) -> bool: return self.page_params.get("only_feeds", False)
def _log_and_stats(self, urls_type, links, allowed_links, disallowed_links): _logs(self, urls_type, links, allowed_links, disallowed_links) _stats(self, urls_type, links, allowed_links, disallowed_links) def _stats(page, urls_type, urls, allowed_urls, disallowed_urls): page.stats.inc(f"article_spider/{urls_type}/visited", 1) page.stats.inc(f"article_spider/{urls_type}/no_links", 0 if urls else 1) page.stats.inc(f"article_spider/{urls_type}/with_links", 1 if urls else 0) page.stats.inc(f"article_spider/{urls_type}/links/total", len(urls)) page.stats.inc(f"article_spider/{urls_type}/links/allow", len(allowed_urls)) page.stats.inc(f"article_spider/{urls_type}/links/disallow", len(disallowed_urls)) def _logs(page, urls_type, urls, allowed_urls, disallowed_urls): page_name = page.item_cls.__name__ data = { "page": page_name, "page url": page.url, "urls type": urls_type, "urls found": len(urls), "allowed urls": len(allowed_urls), "urls to skip": len(disallowed_urls), "list of urls to skip": [ url.url if isinstance(url, Link) else url for url in disallowed_urls ], } logger.debug(f"Article Heuristic Logs:\n{json.dumps(data, indent=2)}")